我有一个数据框,需要对它的列 ['A','B','C','D','E','F','G'] 的所有可能的两列组合分别进行分组。
import pandas as pd

# Sample frame: seven binary indicator columns (A-G) plus two feature
# columns whose sums we want per indicator-pair group.
data = {
    'A': [0, 1, 1, 0, 0, 1, 0, 0],
    'B': [1, 1, 0, 0, 0, 1, 0, 1],
    'C': [0, 0, 1, 1, 0, 0, 1, 0],
    'D': [1, 1, 1, 0, 0, 1, 0, 0],
    'E': [0, 0, 0, 1, 0, 0, 0, 1],
    'F': [0, 0, 1, 1, 0, 0, 1, 0],
    'G': [1, 0, 1, 1, 0, 0, 0, 0],
    'Feature_1': [1, 0, 1, 1, 0, 1, 1, 0],
    'Feature_2': [0, 1, 1, 0, 1, 0, 0, 1],
}
df = pd.DataFrame(data)
print(df)
A B C D E F G Feature_1 Feature_2
0 0 1 0 1 0 0 1 1 0
1 1 1 0 1 0 0 0 0 1
2 1 0 1 1 0 1 1 1 1
3 0 0 1 0 1 1 1 1 0
4 0 0 0 0 0 0 0 0 1
5 1 1 0 1 0 0 0 1 0
6 0 0 1 0 0 1 0 1 0
7 0 1 0 0 1 0 0 0 1
需要对所有长度为 2 的列组合分别进行分组。
有没有办法把这个过程向量化?当数据框的列数较多时,循环会花费很长时间。
预期输出为:
Col_1 Col_2 Col_1_value Col_2_value Feature_1_Sum Feature_2_Sum
0 A B 0 0 2 1
1 A B 0 1 1 1
2 A B 1 0 1 1
3 A B 1 1 1 1
0 A C 0 0 1 2
... ... ... ... ... ... ...
3 E G 1 1 1 0
0 F G 0 0 1 3
1 F G 0 1 1 0
2 F G 1 0 1 0
3 F G 1 1 2 1
77 rows × 6 columns
我目前通过循环实现的方式如下:
from itertools import combinations

# For every 2-column combination of the indicator columns, sum the two
# feature columns per (value, value) group, then stack all the per-pair
# results into one frame `df_agg`.
#
# Fixes versus the original loop:
#  - the try/del bookkeeping with `except Exception: pass` is removed --
#    df_gp is simply rebound on each iteration, so deleting it buys nothing
#    and the broad except could hide real errors;
#  - results are collected in a list and concatenated ONCE after the loop;
#    calling pd.concat inside the loop (the i==1/else branch) copies the
#    accumulated frame every iteration and is quadratic in the number of
#    combinations.
cols_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
out_cols = ['Col_1', 'Col_2', 'Col_1_value', 'Col_2_value',
            'Feature_1_Sum', 'Feature_2_Sum']
frames = []
for col_a, col_b in combinations(cols_list, 2):
    # One groupby per pair: sum both features within each value combination.
    df_gp = (df.groupby(by=[col_a, col_b])
               .agg(Feature_1_Sum=('Feature_1', 'sum'),
                    Feature_2_Sum=('Feature_2', 'sum'))
               .reset_index())
    # Record which pair this slice came from, and copy the grouped values
    # into fixed-name columns so all slices share one schema.
    df_gp['Col_1'] = col_a
    df_gp['Col_2'] = col_b
    df_gp['Col_1_value'] = df_gp[col_a]
    df_gp['Col_2_value'] = df_gp[col_b]
    frames.append(df_gp[out_cols])
df_agg = pd.concat(frames)
df_agg
下面的方案依赖于 numpy 和 pandas——先一次性构建出所有列组合,然后只调用一次 groupby。
不过 itertools.combinations 确实会生成非常多的列,因此根据数据量和计算机内存的大小,您可能会遇到 OOM(内存不足)错误。
import pandas as pd
import numpy as np
from itertools import chain, combinations

# Vectorised alternative: lay every 2-column combination out side by side,
# reshape to long form with numpy, and run a SINGLE groupby instead of one
# groupby per combination.
cols_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
features = ['Feature_1','Feature_2']
agg_features = [f"{col}_sum" for col in features]
# Build (position, column_name, pair_number) tuples for every combination.
# NOTE(review): the two consecutive zip(*...) calls round-trip the
# enumerate output back to (index, name) tuples unchanged -- they appear
# to be kept only as pipeline scaffolding.
combo = enumerate(cols_list)
combo = zip(*combo)
combo = zip(*combo)
combo = combinations(combo, 2)
combo = enumerate(combo)
# Tag both members of a pair with the pair's ordinal `num`, so a column
# that appears in many pairs (e.g. 'A') stays distinguishable in the index.
combo = (((*a,num),(*b,num)) for num, (a,b) in combo)
combo = [*chain.from_iterable(combo)]
combo = pd.MultiIndex.from_tuples(combo)
# build two dataframes, one with cols_list, the other containing just the features
# we blow the columns horizontally,
dff = df.loc[:, 'A':'G']
# Prefix each column with its position so reindex can address duplicates.
columns = pd.MultiIndex.from_arrays([range(dff.columns.size), dff.columns])
dff.columns = columns
# Reindexing duplicates the columns: one copy for every pair they belong to.
dff=dff.reindex(combo.droplevel(-1),axis=1)
# Repeat the two feature columns once per combination (len(combo)//2 pairs)
# so they reshape in lockstep with the paired indicator columns below.
feat1 = np.tile(['Feature_1','Feature_2'],len(combo)//2)
feat2 = np.repeat(range(len(combo)//2),2)
feat3 = [feat1,feat2]
feat3 = pd.MultiIndex.from_arrays(feat3)
features=(df
.loc[:, features]
.reindex(feat3.droplevel(1),axis=1)
)
# reshape, using numpy and recombine into one dataframe
# you can combine the numpy arrays
# and create a DataFrame from the concatenated array
# f1: the paired cell values -> one (value, value) row per row-per-pair.
f1=dff.to_numpy().reshape((-1,2),order='C')
f1=pd.DataFrame(f1,columns=['col_value1','col2_value'])
# f2: the paired column NAMES, tiled once per original df row.
f2=dff.columns.get_level_values(1).to_numpy()
f2=np.reshape(f2, (-1,2),order='C')
f2=np.tile(f2, (len(df),1))
f2=pd.DataFrame(f2,columns=['col1','col2'])
# f3: the repeated feature values, aligned with f1/f2 row for row.
f3=features.to_numpy().reshape((-1,2),order='C')
f3=pd.DataFrame(f3,columns=['Feature_1_sum','Feature_2_sum'])
out=pd.concat([f2,f1,f3],axis=1)
# groupby and compute result
group_columns = ['col1','col2','col_value1','col2_value']
outcome = (out
.groupby(group_columns,sort=False,as_index=False)
.sum()
# you can ignore this
.sort_values(group_columns,ignore_index=True)
)
print(outcome)
col1 col2 col_value1 col2_value Feature_1_sum Feature_2_sum
0 A B 0 0 2 1
1 A B 0 1 1 1
2 A B 1 0 1 1
3 A B 1 1 1 1
4 A C 0 0 1 2
.. ... ... ... ... ... ...
72 E G 1 1 1 0
73 F G 0 0 1 3
74 F G 0 1 1 0
75 F G 1 0 1 0
76 F G 1 1 2 1
[77 rows x 6 columns]