I am trying to run a two-way ANOVA analysis following the example here, which is similar to my own data. Everything seems to work fine with the provided code, but when I call res.tukey_hsd in the following code,
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from bioinfokit.analys import stat
d = pd.read_csv("https://reneshbedre.github.io/assets/posts/anova/twowayanova.txt", sep="\t")
d_melt = pd.melt(d, id_vars=['Genotype'], value_vars=['1_year', '2_year', '3_year'])
# replace column names
d_melt.columns = ['Genotype', 'years', 'value']
d_melt.head()
# perform multiple pairwise comparison (Tukey HSD)
# unequal sample size data, tukey_hsd uses Tukey-Kramer test
res = stat()
# for main effect Genotype
res.tukey_hsd(df=d_melt, res_var='value', xfac_var='Genotype', anova_model='value~C(Genotype)+C(years)+C(Genotype):C(years)')
res.tukey_summary
I get the following error:
TypeError: Could not convert ['AAAAAAAAA' '1_year1_year1_year2_year2_year2_year3_year3_year3_year'] to numeric
I would appreciate some help: what is wrong with the code, and how do I run it correctly?
The full details of the error are:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File ~\anaconda3\Lib\site-packages\pandas\core\nanops.py:1680, in _ensure_numeric(x)
1679 try:
-> 1680 x = x.astype(np.complex128)
1681 except (TypeError, ValueError):
ValueError: complex() arg is a malformed string
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
File ~\anaconda3\Lib\site-packages\pandas\core\nanops.py:1683, in _ensure_numeric(x)
1682 try:
-> 1683 x = x.astype(np.float64)
1684 except ValueError as err:
1685 # GH#29941 we get here with object arrays containing strs
ValueError: could not convert string to float: 'AAAAAAAAA'
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
Cell In[6], line 17
15 res = stat()
16 # for main effect Genotype
---> 17 res.tukey_hsd(df=d_melt, res_var='value', xfac_var='Genotype', anova_model='value~C(Genotype)+C(years)+C(Genotype):C(years)')
18 res.tukey_summary
File ~\anaconda3\Lib\site-packages\bioinfokit\analys.py:882, in stat.tukey_hsd(self, df, res_var, xfac_var, anova_model, phalpha, ss_typ)
878 group_pval = dict()
879 # group_let = dict()
880 # share_let = dict()
--> 882 mult_group, mult_group_count, sample_size_r = analys_general.get_list_from_df(df, xfac_var, res_var, 'get_dict')
884 # self.anova_stat(df, res_var, anova_xfac_var)
885 self.anova_stat(df, anova_model, ss_typ)
File ~\anaconda3\Lib\site-packages\bioinfokit\analys.py:421, in analys_general.get_list_from_df(df, xfac_var, res_var, funct)
419 df_counts += 1
420 elif funct == 'get_dict':
--> 421 mult_group[ele] = df[df[xfac_var] == ele].mean().loc[res_var]
422 mult_group_count[ele] = df[df[xfac_var] == ele].shape[0]
423 elif isinstance(xfac_var, list) and len(xfac_var) > 3:
File ~\anaconda3\Lib\site-packages\pandas\core\generic.py:11556, in NDFrame._add_numeric_operations.<locals>.mean(self, axis, skipna, numeric_only, **kwargs)
11539 @doc(
11540 _num_doc,
11541 desc="Return the mean of the values over the requested axis.",
(...)
11554 **kwargs,
11555 ):
> 11556 return NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)
File ~\anaconda3\Lib\site-packages\pandas\core\generic.py:11201, in NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)
11194 def mean(
11195 self,
11196 axis: Axis | None = 0,
(...)
11199 **kwargs,
11200 ) -> Series | float:
> 11201 return self._stat_function(
11202 "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
11203 )
File ~\anaconda3\Lib\site-packages\pandas\core\generic.py:11158, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
11154 nv.validate_stat_func((), kwargs, fname=name)
11156 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 11158 return self._reduce(
11159 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
11160 )
File ~\anaconda3\Lib\site-packages\pandas\core\frame.py:10519, in DataFrame._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
10515 df = df.T
10517 # After possibly _get_data and transposing, we are now in the
10518 # simple case where we can use BlockManager.reduce
> 10519 res = df._mgr.reduce(blk_func)
10520 out = df._constructor(res).iloc[0]
10521 if out_dtype is not None:
File ~\anaconda3\Lib\site-packages\pandas\core\internals\managers.py:1534, in BlockManager.reduce(self, func)
1532 res_blocks: list[Block] = []
1533 for blk in self.blocks:
-> 1534 nbs = blk.reduce(func)
1535 res_blocks.extend(nbs)
1537 index = Index([None]) # placeholder
File ~\anaconda3\Lib\site-packages\pandas\core\internals\blocks.py:339, in Block.reduce(self, func)
333 @final
334 def reduce(self, func) -> list[Block]:
335 # We will apply the function and reshape the result into a single-row
336 # Block with the same mgr_locs; squeezing will be done at a higher level
337 assert self.ndim == 2
--> 339 result = func(self.values)
341 if self.values.ndim == 1:
342 # TODO(EA2D): special case not needed with 2D EAs
343 res_values = np.array([[result]])
File ~\anaconda3\Lib\site-packages\pandas\core\frame.py:10482, in DataFrame._reduce.<locals>.blk_func(values, axis)
10480 return values._reduce(name, skipna=skipna, **kwds)
10481 else:
> 10482 return op(values, axis=axis, skipna=skipna, **kwds)
File ~\anaconda3\Lib\site-packages\pandas\core\nanops.py:96, in disallow.__call__.<locals>._f(*args, **kwargs)
94 try:
95 with np.errstate(invalid="ignore"):
---> 96 return f(*args, **kwargs)
97 except ValueError as e:
98 # we want to transform an object array
99 # ValueError message to the more typical TypeError
100 # e.g. this is normally a disallowed function on
101 # object arrays that contain strings
102 if is_object_dtype(args[0]):
File ~\anaconda3\Lib\site-packages\pandas\core\nanops.py:158, in bottleneck_switch.__call__.<locals>.f(values, axis, skipna, **kwds)
156 result = alt(values, axis=axis, skipna=skipna, **kwds)
157 else:
--> 158 result = alt(values, axis=axis, skipna=skipna, **kwds)
160 return result
File ~\anaconda3\Lib\site-packages\pandas\core\nanops.py:421, in _datetimelike_compat.<locals>.new_func(values, axis, skipna, mask, **kwargs)
418 if datetimelike and mask is None:
419 mask = isna(values)
--> 421 result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
423 if datetimelike:
424 result = _wrap_results(result, orig_values.dtype, fill_value=iNaT)
File ~\anaconda3\Lib\site-packages\pandas\core\nanops.py:727, in nanmean(values, axis, skipna, mask)
724 dtype_count = dtype
726 count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
--> 727 the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
729 if axis is not None and getattr(the_sum, "ndim", False):
730 count = cast(np.ndarray, count)
File ~\anaconda3\Lib\site-packages\pandas\core\nanops.py:1686, in _ensure_numeric(x)
1683 x = x.astype(np.float64)
1684 except ValueError as err:
1685 # GH#29941 we get here with object arrays containing strs
-> 1686 raise TypeError(f"Could not convert {x} to numeric") from err
1687 else:
1688 if not np.any(np.imag(x)):
TypeError: Could not convert ['AAAAAAAAA' '1_year1_year1_year2_year2_year2_year3_year3_year3_year'] to numeric
Check the data types of the DataFrame columns. You can do this with the DataFrame's dtypes attribute:
print(d_melt.dtypes)
You can also convert the value column to a numeric type; any non-numeric values will be converted to NaN.
d_melt['value'] = pd.to_numeric(d_melt['value'], errors='coerce')
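Putting both steps together, here is a minimal sketch of the suggested workaround (it assumes the tutorial file loads exactly as in your snippet, and that any rows whose value fails to convert can simply be dropped):
import pandas as pd
from bioinfokit.analys import stat

d = pd.read_csv("https://reneshbedre.github.io/assets/posts/anova/twowayanova.txt", sep="\t")
d_melt = pd.melt(d, id_vars=['Genotype'], value_vars=['1_year', '2_year', '3_year'])
d_melt.columns = ['Genotype', 'years', 'value']

# inspect the dtypes: 'value' should be int64/float64, not object
print(d_melt.dtypes)

# coerce 'value' to numeric; anything non-numeric becomes NaN
d_melt['value'] = pd.to_numeric(d_melt['value'], errors='coerce')
# assumption: rows that failed to convert are safe to discard
d_melt = d_melt.dropna(subset=['value'])

res = stat()
res.tukey_hsd(df=d_melt, res_var='value', xfac_var='Genotype',
              anova_model='value~C(Genotype)+C(years)+C(Genotype):C(years)')
print(res.tukey_summary)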