DataFrame A 包含列 x、y1、y2、y3 和 y4。我想研究每个 x 对应的 {y1,y2,y3,y4} 序列。为了找出每个 x 下出现过的唯一 {y1,y2,y3,y4} 序列,我执行以下操作:
# Gather the rows of A one x value at a time, then keep every distinct
# (x, y1, y2, y3, y4) combination once, ordered by those columns.
B = pd.DataFrame()
for x_temp in A['x'].unique():
    B = pd.concat([B, A[A['x'] == x_temp][['x','y1','y2','y3','y4']]])
B = B.drop_duplicates().sort_values(by=['x','y1','y2','y3','y4'])
# The loop variable is no longer needed once B is built.
del x_temp
我想在 B 中引入一个名为“count”的新列,其中包含 A 中特定 x 出现的唯一 {y1,y2,y3,y4} 的数量。
# NOTE: `==` is never true for NaN, so any row with a missing y value ends up
# with a count of 0. The x column is not part of this comparison.
B['count'] = A.apply(lambda row: (A['y1'] == row['y1']) & (A['y2'] == row['y2']) & (A['y3'] == row['y3']) & (A['y4'] == row['y4']), axis=1).sum()
在没有缺失值时这是可行的;但是,如果 A 或 B 中存在缺失值(NaN),它就不起作用,因为 NaN 与任何值(包括 NaN 本身)比较都不相等。我希望缺失值也被当作一个普通的取值来参与匹配。
示例:
# Reproducible example: string-valued columns with NaN scattered through y1..y4.
A = pd.DataFrame({'x':['1','1','1','1','2','2','2','2','2','1'],
                  'y1':['1','2','2',np.nan,'2',np.nan,'2','2','2','1'],
                  'y2':['2','1','2','2',np.nan,np.nan,'2','2','1','2'],
                  'y3':['1','1',np.nan,'2',np.nan,'2',np.nan,np.nan,'1','1'],
                  'y4':['2','2','2',np.nan,np.nan,'1','2','2','2','2']})

# Gather the rows of A one x value at a time, then keep every distinct
# (x, y1, y2, y3, y4) combination once, ordered by those columns.
B = pd.DataFrame()
for x_temp in A['x'].unique():
    B = pd.concat([B, A[A['x'] == x_temp][['x','y1','y2','y3','y4']]])
B = B.drop_duplicates().sort_values(by=['x','y1','y2','y3','y4'])
del x_temp

# NOTE: `==` is never true for NaN, so rows containing a missing y value get a
# count of 0 here. This is the behaviour the example is meant to demonstrate.
B['count'] = A.apply(lambda row: (A['y1'] == row['y1']) & (A['y2'] == row['y2']) & (A['y3'] == row['y3']) & (A['y4'] == row['y4']), axis=1).sum()
print(B)
x y1 y2 y3 y4 count
0 1 1 2 1 2 2
1 1 2 1 1 2 2
2 1 2 2 NaN 2 0
3 1 NaN 2 2 NaN 0
8 2 2 1 1 2 2
6 2 2 2 NaN 2 0
4 2 2 NaN NaN NaN 0
5 2 NaN NaN 2 1 0
用下面这一行代码即可达到想要的效果(其中的缺失值会被替换为字符串 'missing'):
# One row per distinct (x, y1..y4) combination; `count` is the number of such
# distinct rows sharing the same x. Remaining NaN cells become 'missing'.
B = (
    A.drop_duplicates(subset=['x', 'y1', 'y2', 'y3', 'y4'])
    .assign(count=lambda frame: frame.groupby('x').transform('size'))
    .fillna('missing')
)
您可以使用 `notna()` 和 `isna()` 创建掩码(mask),然后用 `apply()` 逐行统计:
import pandas as pd
import numpy as np
def _unique(A):
    """Return the distinct (x, y1, y2, y3, y4) rows of A with their occurrence counts.

    Missing values are treated as ordinary values in y1..y4: two rows match
    when each of those columns holds the same value or is missing in both.

    Args:
        A: DataFrame containing at least the columns x, y1, y2, y3 and y4.

    Returns:
        A new DataFrame with the five key columns, de-duplicated and sorted by
        them, keeping the index label of each combination's first occurrence,
        plus an integer ``count`` column holding the number of rows of A that
        match the combination. Rows whose x is missing are not included.
    """
    cols = ['x', 'y1', 'y2', 'y3', 'y4']

    # De-duplicate in one pass instead of concatenating per-x slices. Rows
    # with a missing x are left out, since `A['x'] == value` never selects them.
    B = A.loc[A['x'].notna(), cols].drop_duplicates().sort_values(by=cols)

    def _mask(row):
        # Count the rows of A equal to `row`, with NaN matching NaN in y1..y4.
        matches = A['x'] == row['x']
        for col in cols[1:]:
            value = row[col]
            if pd.notna(value):
                # Plain equality: NaN cells in A compare unequal, so no
                # sentinel is needed and none can collide with real data.
                matches &= A[col] == value
            else:
                matches &= A[col].isna()
        return int(matches.sum())

    # Built as an explicit Series so an empty B still gets an integer column.
    B['count'] = pd.Series(
        [_mask(row) for _, row in B.iterrows()],
        index=B.index,
        dtype='int64',
    )
    return B
# Small numeric example with one missing value in each of y1, y2 and y4.
A = pd.DataFrame(dict(
    x=[1, 1, 2, 2, 2],
    y1=[1, 1, 2, np.nan, 3],
    y2=[2, 2, np.nan, 3, 3],
    y3=[3, 3, 4, 4, 4],
    y4=[4, 4, 5, 5, np.nan],
))
print(_unique(A))
x y1 y2 y3 y4 count
0 1 1.0 2.0 3 4.0 2
2 2 2.0 NaN 4 5.0 1
4 2 3.0 3.0 4 NaN 1
3 2 NaN 3.0 4 5.0 1
假设您想先统计每种 (x, y1, y2, y3, y4) 组合的出现次数,再把同一 {y1,y2,y3,y4} 序列在不同 x 下的次数相加,可以使用:
cols = ['x', 'y1', 'y2', 'y3', 'y4']
# Count every distinct (x, y1..y4) row, keeping rows that contain NaN.
out = A[cols].value_counts(dropna=False, sort=False).reset_index(name='count')
out = out.sort_values(by=cols, na_position='last')
# Replace each count with the total for the same y1..y4 sequence across all x.
out = out.assign(
    count=lambda df: df.groupby(cols[1:], dropna=False)['count'].transform('sum')
)
输出:
x y1 y2 y3 y4 count
0 1 1 2 1 2 2
1 1 2 1 1 2 2
2 1 2 2 NaN 2 3
3 1 NaN 2 2 NaN 1
4 2 2 1 1 2 2
5 2 2 2 NaN 2 3
6 2 2 NaN NaN NaN 1
7 2 NaN NaN 2 1 1
如果要将含有 NaN 的行的计数设为 0:
cols = ['x', 'y1', 'y2', 'y3', 'y4']
# Count every distinct (x, y1..y4) row, keeping rows that contain NaN.
out = A[cols].value_counts(dropna=False, sort=False).reset_index(name='count')
out = out.sort_values(by=cols, na_position='last')
# Grouping with the default dropna leaves rows with a NaN key without a total;
# those gaps are then filled with 0.
out = out.assign(
    count=lambda df: df.groupby(cols[1:])['count']
    .transform('sum')
    .fillna(0)
    .convert_dtypes()
)
输出:
x y1 y2 y3 y4 count
0 1 1 2 1 2 2
1 1 2 1 1 2 2
2 1 2 2 NaN 2 0
3 1 NaN 2 2 NaN 0
4 2 2 1 1 2 2
5 2 2 2 NaN 2 0
6 2 2 NaN NaN NaN 0
7 2 NaN NaN 2 1 0
如果您不想跨不同的 x 求和(即只统计每个 x 内各序列的出现次数):
cols = ['x', 'y1', 'y2', 'y3', 'y4']
# Count every distinct (x, y1..y4) row, keeping rows that contain NaN.
out = A[cols].value_counts(dropna=False, sort=False).reset_index(name='count')
out = out.sort_values(by=cols, na_position='last')
输出:
x y1 y2 y3 y4 count
0 1 1 2 1 2 2
1 1 2 1 1 2 1
2 1 2 2 NaN 2 1
3 1 NaN 2 2 NaN 1
4 2 2 1 1 2 1
5 2 2 2 NaN 2 2
6 2 2 NaN NaN NaN 1
7 2 NaN NaN 2 1 1
尝试:
def count_duplicated(g):
    """Count how many times each distinct row of ``g`` occurs.

    Args:
        g: DataFrame whose rows are to be counted.

    Returns:
        A DataFrame with the columns of ``g`` plus a ``count`` column, one row
        per distinct row of ``g``, in the order produced by ``value_counts``.
    """
    # Collapse each row into a tuple so the whole row is counted as one value.
    vc = g.agg(tuple, axis=1).value_counts()
    # Unpack each tuple back into its columns and append the count.
    return pd.DataFrame(
        [[*k, v] for k, v in vc.items()],
        columns=[*g.columns, "count"],
    )
# Count the distinct rows within each x group.
B = A.groupby("x").apply(count_duplicated, include_groups=False)
# Drop the per-group row number and turn the x level back into a column.
B = B.droplevel(1).reset_index()
print(B)
打印:
x y1 y2 y3 y4 count
0 1 1 2 1 2 2
1 1 2 1 1 2 1
2 1 2 2 NaN 2 1
3 1 NaN 2 2 NaN 1
4 2 2 2 NaN 2 2
5 2 2 NaN NaN NaN 1
6 2 NaN NaN 2 1 1
7 2 2 1 1 2 1