在Python中查找由跨多个列的值组成的唯一序列的计数

问题描述 投票:0回答:4

A 包含列 x、y1、y2、y3 和 y4。 我有兴趣研究 x 的 {y1,y2,y3,y4} 序列。 为了找到每个 x 出现的唯一 {y1,y2,y3,y4} 序列,我执行以下操作:

# Collect the rows of A one x value at a time (starting from an empty frame),
# then keep a single copy of every distinct (x, y1, y2, y3, y4) row, sorted
# by those columns.
B = pd.concat(
    [pd.DataFrame()]
    + [
        A.loc[A['x'] == x_val, ['x', 'y1', 'y2', 'y3', 'y4']]
        for x_val in A['x'].unique()
    ]
)
B = B.drop_duplicates()
B = B.sort_values(by=['x', 'y1', 'y2', 'y3', 'y4'])

我想在 B 中引入一个名为“count”的新列,其中包含 A 中特定 x 出现的唯一 {y1,y2,y3,y4} 的数量。

# For each row of A, flag the rows of A whose y1..y4 all equal that row's
# values; summing the flags per A index label gives the number of matching
# rows, which is then aligned onto B by index.
B['count'] = A.apply(
    lambda row: A['y1'].eq(row['y1'])
    & A['y2'].eq(row['y2'])
    & A['y3'].eq(row['y3'])
    & A['y4'].eq(row['y4']),
    axis=1,
).sum()

这可行,但是,如果 A 或 B 缺少值,则不起作用。我希望它将缺失值也视为唯一值。

示例:

# Sample data: string-valued columns, with np.nan marking missing entries.
A = pd.DataFrame({
    'x': ['1', '1', '1', '1', '2', '2', '2', '2', '2', '1'],
    'y1': ['1', '2', '2', np.nan, '2', np.nan, '2', '2', '2', '1'],
    'y2': ['2', '1', '2', '2', np.nan, np.nan, '2', '2', '1', '2'],
    'y3': ['1', '1', np.nan, '2', np.nan, '2', np.nan, np.nan, '1', '1'],
    'y4': ['2', '2', '2', np.nan, np.nan, '1', '2', '2', '2', '2'],
})

# Collect the rows of A one x value at a time (starting from an empty frame),
# then keep a single copy of every distinct (x, y1, y2, y3, y4) row, sorted
# by those columns.
B = pd.concat(
    [pd.DataFrame()]
    + [
        A.loc[A['x'] == x_val, ['x', 'y1', 'y2', 'y3', 'y4']]
        for x_val in A['x'].unique()
    ]
)
B = B.drop_duplicates()
B = B.sort_values(by=['x', 'y1', 'y2', 'y3', 'y4'])

# For each row of A, flag the rows of A whose y1..y4 all equal that row's
# values and sum the flags per A index label. An equality test against NaN
# is never true, so every row containing a missing value ends up with 0.
B['count'] = A.apply(
    lambda row: A['y1'].eq(row['y1'])
    & A['y2'].eq(row['y2'])
    & A['y3'].eq(row['y3'])
    & A['y4'].eq(row['y4']),
    axis=1,
).sum()
print(B)

   x   y1   y2   y3   y4  count
0  1    1    2    1    2      2
1  1    2    1    1    2      2
2  1    2    2  NaN    2      0
3  1  NaN    2    2  NaN      0
8  2    2    1    1    2      2
6  2    2    2  NaN    2      0
4  2    2  NaN  NaN  NaN      0
5  2  NaN  NaN    2    1      0
python pandas dataframe
4个回答
0
投票

这样你就可以达到想要的效果。

# Reduce A to one row per distinct (x, y1..y4) combination, attach to each
# row the number of distinct combinations that share its x, then replace the
# remaining missing cells with the string 'missing'.
B = A.drop_duplicates(subset=['x', 'y1', 'y2', 'y3', 'y4'])
B = B.assign(count=B.groupby('x').transform('size')).fillna('missing')

0
投票

您可以使用 `notna()`/`isna()` 创建掩码,然后调用 `apply()`:

import pandas as pd
import numpy as np

def _unique(A, cols=('x', 'y1', 'y2', 'y3', 'y4')):
    """Return the distinct rows of ``A[cols]`` with their occurrence counts.

    Missing values are treated as ordinary values: two rows are equal when,
    for every column in ``cols``, they hold the same value or are both
    missing. Rows whose first key column is missing are kept as well.

    Parameters
    ----------
    A : pandas.DataFrame
        Input frame; must contain every column named in ``cols``.
    cols : sequence of str, optional
        Columns that make up one sequence. Defaults to
        ``('x', 'y1', 'y2', 'y3', 'y4')``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination, labelled with the index of its
        first occurrence in ``A`` and sorted by ``cols`` (missing values
        last), with an extra integer ``count`` column giving how many rows
        of ``A`` hold that combination.
    """
    cols = list(cols)

    # Encode every key column as integer codes. factorize() assigns all
    # missing values the same code (-1), so NaN matches NaN without swapping
    # in a placeholder string that could collide with real data.
    codes = pd.DataFrame(
        {col: pd.factorize(A[col])[0] for col in cols},
        index=A.index,
    )

    # Size of the group each row belongs to, computed in a single pass.
    occurrences = codes.groupby(cols)[cols[0]].transform('size')

    B = (
        A[cols]
        # Attach positionally so duplicate index labels in A cannot misalign.
        .assign(count=occurrences.to_numpy())
        .drop_duplicates(subset=cols)
        .sort_values(by=cols)
    )

    return B


# Small numeric sample: one duplicated row for x == 1 and three distinct
# rows for x == 2, each with a missing value in a different column.
A = pd.DataFrame(
    {
        'x': [1, 1, 2, 2, 2],
        'y1': [1, 1, 2, np.nan, 3],
        'y2': [2, 2, np.nan, 3, 3],
        'y3': [3, 3, 4, 4, 4],
        'y4': [4, 4, 5, 5, np.nan],
    }
)

result = _unique(A)
print(result)

打印

   x   y1   y2  y3   y4  count
0  1  1.0  2.0   3  4.0      2
2  2  2.0  NaN   4  5.0      1
4  2  3.0  3.0   4  NaN      1
3  2  NaN  3.0   4  5.0      1

0
投票

假设您想要对值进行计数,然后获得不同 x 的总和,您可以使用:

cols = ['x', 'y1', 'y2', 'y3', 'y4']

# Count every distinct (x, y1..y4) row, keeping rows that contain NaN.
out = A[cols].value_counts(dropna=False, sort=False).reset_index(name='count')
out = out.sort_values(by=cols, na_position='last')

# Replace each count by the total for the same y1..y4 sequence across all x
# values; dropna=False keeps sequences containing NaN as groups of their own.
out['count'] = out.groupby(cols[1:], dropna=False)['count'].transform('sum')

输出:

   x   y1   y2   y3   y4  count
0  1    1    2    1    2      2
1  1    2    1    1    2      2
2  1    2    2  NaN    2      3
3  1  NaN    2    2  NaN      1
4  2    2    1    1    2      2
5  2    2    2  NaN    2      3
6  2    2  NaN  NaN  NaN      1
7  2  NaN  NaN    2    1      1

如果要将 NaN 的行设置为 0:

cols = ['x', 'y1', 'y2', 'y3', 'y4']

# Count every distinct (x, y1..y4) row, keeping rows that contain NaN.
out = A[cols].value_counts(dropna=False, sort=False).reset_index(name='count')
out = out.sort_values(by=cols, na_position='last')

# Sum the counts per y1..y4 sequence across x. The default groupby drops
# sequences containing NaN, so those rows come back empty and are set to 0
# before the column is converted to a nullable dtype.
summed = out.groupby(cols[1:])['count'].transform('sum')
out['count'] = summed.fillna(0).convert_dtypes()

输出:

   x   y1   y2   y3   y4  count
0  1    1    2    1    2      2
1  1    2    1    1    2      2
2  1    2    2  NaN    2      0
3  1  NaN    2    2  NaN      0
4  2    2    1    1    2      2
5  2    2    2  NaN    2      0
6  2    2  NaN  NaN  NaN      0
7  2  NaN  NaN    2    1      0

如果您不想对 xs 求和:

cols = ['x', 'y1', 'y2', 'y3', 'y4']

# Count every distinct (x, y1..y4) row, keeping rows that contain NaN, and
# list them sorted by the key columns with missing values last.
out = A[cols].value_counts(dropna=False, sort=False)
out = out.reset_index(name='count')
out = out.sort_values(by=cols, na_position='last')

输出:

   x   y1   y2   y3   y4  count
0  1    1    2    1    2      2
1  1    2    1    1    2      1
2  1    2    2  NaN    2      1
3  1  NaN    2    2  NaN      1
4  2    2    1    1    2      1
5  2    2    2  NaN    2      2
6  2    2  NaN  NaN  NaN      1
7  2  NaN  NaN    2    1      1

0
投票

尝试:

def count_duplicated(g):
    """Tally how often each distinct row of ``g`` occurs.

    Each row is turned into a tuple of its values and the tuples are counted.
    The result has the columns of ``g`` plus a ``count`` column, one row per
    distinct tuple, in the order produced by ``value_counts()``.
    """
    row_keys = g.agg(tuple, axis=1)
    tallies = row_keys.value_counts()

    # Spread every tuple back into its column values and append its tally.
    records = [list(key) + [n] for key, n in tallies.items()]
    header = list(g.columns) + ["count"]
    return pd.DataFrame(records, columns=header)


# Run count_duplicated on the y columns of each x group; the grouping column
# itself is not passed to the function.
B = A.groupby("x").apply(count_duplicated, include_groups=False)
# Discard the per-group row numbers and turn x back into a regular column.
B = B.droplevel(1)
B = B.reset_index()
print(B)

打印:

   x   y1   y2   y3   y4  count
0  1    1    2    1    2      2
1  1    2    1    1    2      1
2  1    2    2  NaN    2      1
3  1  NaN    2    2  NaN      1
4  2    2    2  NaN    2      2
5  2    2  NaN  NaN  NaN      1
6  2  NaN  NaN    2    1      1
7  2    2    1    1    2      1
© www.soinside.com 2019 - 2024. All rights reserved.