我有这个数据框。
import pandas as pd
# Raw records for the example. Every value -- including the four score
# columns -- is kept as a string on purpose, exactly as in the question.
x = {
    "year": "2012 2012 2013 2014 2012 2014 2013 2013 2012 2013 2012 2014 2014 2013 2012 2014".split(),
    "class": "A B C A C B B C A C B C A C B A".split(),
    "gender": "M F F M F M M F F F M M F M F F".split(),
    "score1": "6 6 8 10 6 7 6 7 8 7 10 9 9 9 8 9".split(),
    "score2": "5 9 10 5 10 9 5 7 8 9 8 8 5 5 8 5".split(),
    "score3": "5 9 9 7 8 5 9 5 7 6 5 10 8 8 6 8".split(),
    "score4": "10 8 8 10 9 8 10 9 7 8 10 9 7 7 10 7".split(),
}
# One DataFrame column per dict key, in insertion order.
data = pd.DataFrame(x)
我想找出
dtypes = 'int64'
的每一列的中位数。于是我对 df 按 class
列进行分组。
# numeric_only defaults to False here, so median() is also attempted on the
# string columns (e.g. "gender"); this call raises the TypeError shown below.
data.groupby('class').median()
但它显示错误。
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1490, in GroupBy._cython_agg_general..array_func(values)
1489 try:
-> 1490 result = self.grouper._cython_operation(
1491 "aggregate",
1492 values,
1493 how,
1494 axis=data.ndim - 1,
1495 min_count=min_count,
1496 **kwargs,
1497 )
1498 except NotImplementedError:
1499 # generally if we have numeric_only=False
1500 # and non-applicable functions
1501 # try to python agg
1502 # TODO: shouldn't min_count matter?
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:959, in BaseGrouper._cython_operation(self, kind, values, how, axis, min_count, **kwargs)
958 ngroups = self.ngroups
--> 959 return cy_op.cython_operation(
960 values=values,
961 axis=axis,
962 min_count=min_count,
963 comp_ids=ids,
964 ngroups=ngroups,
965 **kwargs,
966 )
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:657, in WrappedCythonOp.cython_operation(self, values, axis, min_count, comp_ids, ngroups, **kwargs)
649 return self._ea_wrap_cython_operation(
650 values,
651 min_count=min_count,
(...)
654 **kwargs,
655 )
--> 657 return self._cython_op_ndim_compat(
658 values,
659 min_count=min_count,
660 ngroups=ngroups,
661 comp_ids=comp_ids,
662 mask=None,
663 **kwargs,
664 )
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:497, in WrappedCythonOp._cython_op_ndim_compat(self, values, min_count, ngroups, comp_ids, mask, result_mask, **kwargs)
495 return res.T
--> 497 return self._call_cython_op(
498 values,
499 min_count=min_count,
500 ngroups=ngroups,
501 comp_ids=comp_ids,
502 mask=mask,
503 result_mask=result_mask,
504 **kwargs,
505 )
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:541, in WrappedCythonOp._call_cython_op(self, values, min_count, ngroups, comp_ids, mask, result_mask, **kwargs)
540 out_shape = self._get_output_shape(ngroups, values)
--> 541 func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric)
542 values = self._get_cython_vals(values)
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:167, in WrappedCythonOp._get_cython_function(cls, kind, how, dtype, is_numeric)
165 if how in ["median", "cumprod"]:
166 # no fused types -> no __signatures__
--> 167 raise NotImplementedError(
168 f"function is not implemented for this dtype: "
169 f"[how->{how},dtype->{dtype_str}]"
170 )
171 if "object" not in f.__signatures__:
172 # raise NotImplementedError here rather than TypeError later
NotImplementedError: function is not implemented for this dtype: [how->median,dtype->object]
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\nanops.py:786, in nanmedian(values, axis, skipna, mask)
785 try:
--> 786 values = values.astype("f8")
787 except ValueError as err:
788 # e.g. "could not convert string to float: 'a'"
ValueError: could not convert string to float: 'M'
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
Cell In[135], line 1
----> 1 data.groupby('class').median()
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1883, in GroupBy.median(self, numeric_only)
1862 @final
1863 def median(self, numeric_only: bool = False):
1864 """
1865 Compute median of groups, excluding missing values.
1866
(...)
1881 Median of values within each group.
1882 """
-> 1883 result = self._cython_agg_general(
1884 "median",
1885 alt=lambda x: Series(x).median(numeric_only=numeric_only),
1886 numeric_only=numeric_only,
1887 )
1888 return result.__finalize__(self.obj, method="groupby")
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1507, in GroupBy._cython_agg_general(self, how, alt, numeric_only, min_count, **kwargs)
1503 result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
1505 return result
-> 1507 new_mgr = data.grouped_reduce(array_func)
1508 res = self._wrap_agged_manager(new_mgr)
1509 out = self._wrap_aggregated_output(res)
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\internals\managers.py:1503, in BlockManager.grouped_reduce(self, func)
1499 if blk.is_object:
1500 # split on object-dtype blocks bc some columns may raise
1501 # while others do not.
1502 for sb in blk._split():
-> 1503 applied = sb.apply(func)
1504 result_blocks = extend_blocks(applied, result_blocks)
1505 else:
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\internals\blocks.py:329, in Block.apply(self, func, **kwargs)
323 @final
324 def apply(self, func, **kwargs) -> list[Block]:
325 """
326 apply the function to my values; return a block if we are not
327 one
328 """
--> 329 result = func(self.values, **kwargs)
331 return self._split_op_result(result)
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1503, in GroupBy._cython_agg_general..array_func(values)
1490 result = self.grouper._cython_operation(
1491 "aggregate",
1492 values,
(...)
1496 **kwargs,
1497 )
1498 except NotImplementedError:
1499 # generally if we have numeric_only=False
1500 # and non-applicable functions
1501 # try to python agg
1502 # TODO: shouldn't min_count matter?
-> 1503 result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
1505 return result
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1457, in GroupBy._agg_py_fallback(self, values, ndim, alt)
1452 ser = df.iloc[:, 0]
1454 # We do not get here with UDFs, so we know that our dtype
1455 # should always be preserved by the implemented aggregations
1456 # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
-> 1457 res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
1459 if isinstance(values, Categorical):
1460 # Because we only get here with known dtype-preserving
1461 # reductions, we cast back to Categorical.
1462 # TODO: if we ever get "rank" working, exclude it here.
1463 res_values = type(values)._from_sequence(res_values, dtype=values.dtype)
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:994, in BaseGrouper.agg_series(self, obj, func, preserve_dtype)
987 if len(obj) > 0 and not isinstance(obj._values, np.ndarray):
988 # we can preserve a little bit more aggressively with EA dtype
989 # because maybe_cast_pointwise_result will do a try/except
990 # with _from_sequence. NB we are assuming here that _from_sequence
991 # is sufficiently strict that it casts appropriately.
992 preserve_dtype = True
--> 994 result = self._aggregate_series_pure_python(obj, func)
996 npvalues = lib.maybe_convert_objects(result, try_float=False)
997 if preserve_dtype:
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:1015, in BaseGrouper._aggregate_series_pure_python(self, obj, func)
1012 splitter = self._get_splitter(obj, axis=0)
1014 for i, group in enumerate(splitter):
-> 1015 res = func(group)
1016 res = libreduction.extract_result(res)
1018 if not initialized:
1019 # We only do this validation on the first iteration
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1885, in GroupBy.median..(x)
1862 @final
1863 def median(self, numeric_only: bool = False):
1864 """
1865 Compute median of groups, excluding missing values.
1866
(...)
1881 Median of values within each group.
1882 """
1883 result = self._cython_agg_general(
1884 "median",
-> 1885 alt=lambda x: Series(x).median(numeric_only=numeric_only),
1886 numeric_only=numeric_only,
1887 )
1888 return result.__finalize__(self.obj, method="groupby")
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py:11623, in NDFrame._add_numeric_operations..median(self, axis, skipna, numeric_only, **kwargs)
11606 @doc(
11607 _num_doc,
11608 desc="Return the median of the values over the requested axis.",
(...)
11621 **kwargs,
11622 ):
> 11623 return NDFrame.median(self, axis, skipna, numeric_only, **kwargs)
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py:11212, in NDFrame.median(self, axis, skipna, numeric_only, **kwargs)
11205 def median(
11206 self,
11207 axis: Axis | None = 0,
(...)
11210 **kwargs,
11211 ) -> Series | float:
> 11212 return self._stat_function(
11213 "median", nanops.nanmedian, axis, skipna, numeric_only, **kwargs
11214 )
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py:11158, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
11154 nv.validate_stat_func((), kwargs, fname=name)
11156 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 11158 return self._reduce(
11159 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
11160 )
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\series.py:4670, in Series._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
4665 raise TypeError(
4666 f"Series.{name} does not allow {kwd_name}={numeric_only} "
4667 "with non-numeric dtypes."
4668 )
4669 with np.errstate(all="ignore"):
-> 4670 return op(delegate, skipna=skipna, **kwds)
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\nanops.py:158, in bottleneck_switch.__call__..f(values, axis, skipna, **kwds)
156 result = alt(values, axis=axis, skipna=skipna, **kwds)
157 else:
--> 158 result = alt(values, axis=axis, skipna=skipna, **kwds)
160 return result
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\nanops.py:789, in nanmedian(values, axis, skipna, mask)
786 values = values.astype("f8")
787 except ValueError as err:
788 # e.g. "could not convert string to float: 'a'"
--> 789 raise TypeError(str(err)) from err
790 if mask is not None:
791 values[mask] = np.nan
TypeError: could not convert string to float: 'M'
从上面的错误框中可以看出,
groupby
对gender
列进行了聚合。但是当我在 YouTube 上看到有人使用相同的数据框和相同的代码执行此操作时,一切都很好并且没有显示错误。
所以问题是:
groupby
上遗漏了什么吗?问题在于 DataFrame 中的
score1, score2, score3
和 score4
列是以字符串而不是数字类型存储的。可以这样做:
import pandas as pd
# Same records as in the question: every value, including the scores, is a string.
x = {
    "year": ["2012", "2012", "2013", "2014", "2012", "2014", "2013", "2013", "2012", "2013", "2012", "2014", "2014", "2013", "2012", "2014"],
    "class": ["A", "B", "C", "A", "C", "B", "B", "C", "A", "C", "B", "C", "A", "C", "B", "A"],
    "gender": ["M", "F", "F", "M", "F", "M", "M", "F", "F", "F", "M", "M", "F", "M", "F", "F"],
    "score1": ["6", "6", "8", "10", "6", "7", "6", "7", "8", "7", "10", "9", "9", "9", "8", "9"],
    "score2": ["5", "9", "10", "5", "10", "9", "5", "7", "8", "9", "8", "8", "5", "5", "8", "5"],
    "score3": ["5", "9", "9", "7", "8", "5", "9", "5", "7", "6", "5", "10", "8", "8", "6", "8"],
    "score4": ["10", "8", "8", "10", "9", "8", "10", "9", "7", "8", "10", "9", "7", "7", "10", "7"],
}
data = pd.DataFrame(x)

# The score columns hold digit strings, so convert them to numbers before
# aggregating. The column list is named once instead of being repeated.
score_cols = ["score1", "score2", "score3", "score4"]
data[score_cols] = data[score_cols].apply(pd.to_numeric)

# Keep only the numeric columns; "year", "class" and "gender" are still
# strings and are therefore left out of the aggregation.
numeric_cols = data.select_dtypes(include="number")

# Group the numeric frame by the "class" Series directly (rows are matched on
# the shared index), so the key column does not have to be joined back first.
result = numeric_cols.groupby(data["class"]).median()
print(result)
这给出了
score1 score2 score3 score4
class
A 9.0 5.0 7.0 7.0
B 7.0 8.0 6.0 10.0
C 7.5 8.5 8.0 8.5