尝试将以下示例数据集按 Case 和 Run 两列分为嵌套的组,然后将其余各列转换为嵌套列表(即列表的列表)。
注意,我尝试在不直接进行循环/枚举的情况下执行此操作。
Case,Run,Timestep,Value0,Value1,ValueN
0,0,0,2.626105446,-0.340988725,1.459768689
0,0,1,-0.200319404,-1.709593368,-1.011965009
0,0,2,-2.433554966,-1.558093989,0.44943261
0,1,0,0.02353679,0.280785593,-0.238524231
0,1,1,0.203162019,-1.502525578,1.40292514
0,1,2,-0.037211304,-0.319558867,-1.322124136
0,2,0,-1.45331218,-0.20914855,0.276561536
0,2,1,-0.304656195,-1.594559675,0.264025112
0,2,2,0.030118014,-0.712998507,-0.044159056
1,0,0,0.237803452,0.153455073,-1.084628161
1,0,1,0.814585507,-0.825123856,0.165566792
1,0,2,0.362165592,-0.12731861,-0.572814142
1,1,0,-0.49709788,-0.499310228,0.082185424
1,1,1,0.326764013,0.914642226,-0.96102063
1,1,2,-0.43253699,0.280958492,0.098690723
1,2,0,-0.854096958,0.559531068,0.547050394
1,2,1,0.331055857,1.021091422,0.032419058
1,2,2,0.712204394,-2.050207709,-0.507117825
2,0,0,-0.545231265,1.112460127,0.156334193
2,0,1,-1.507139756,1.296971255,-1.247736656
2,0,2,-0.661400305,0.50431061,1.24670996
2,1,0,-0.215960487,-0.788252945,0.481096264
2,1,1,-1.233657743,0.315172941,0.862535335
2,1,2,0.371303046,0.770596557,2.017713093
2,2,0,0.282822836,-0.332043452,1.107596241
2,2,1,0.085487262,1.104947912,0.545943326
2,2,2,0.864801017,1.082528278,0.386386756
理想情况下,我的最终结果将类似于 MxNxD,其中 D = [TimeStep, Value0, Value1, ValueN]。因此,如果
arr = data
那么:
arr[0]
= Case 0内所有
n_runs
的嵌套列表
arr[0][0]
= 运行 0 case 0 的所有样本的嵌套列表
arr[0][0][0]
= 来自运行 0 案例 0 的 Sample0 (1x4)
所需输出:
[[[[0.0, 2.626105446, -0.340988725, 1.459768689], [1.0, -0.200319404, -1.709593368, -1.011965009], [2.0, -2.433554966, -1.558093989, 0.44943261]], [[0.0, 0.02353679, 0.280785593, -0.238524231], [1.0, 0.203162019, -1.502525578, 1.40292514], [2.0, -0.037211304, -0.319558867, -1.322124136]], [[0.0, -1.45331218, -0.20914855, 0.276561536], [1.0, -0.304656195, -1.594559675, 0.264025112], [2.0, 0.030118014, -0.712998507, -0.044159056]]], [[[0.0, 2.626105446, -0.340988725, 1.459768689], [1.0, -0.200319404, -1.709593368, -1.011965009], [2.0, -2.433554966, -1.558093989, 0.44943261]], [[0.0, 0.02353679, 0.280785593, -0.238524231], [1.0, 0.203162019, -1.502525578, 1.40292514], [2.0, -0.037211304, -0.319558867, -1.322124136]], [[0.0, -1.45331218, -0.20914855, 0.276561536], [1.0, -0.304656195, -1.594559675, 0.264025112], [2.0, 0.030118014, -0.712998507, -0.044159056]]], [[[0.0, 2.626105446, -0.340988725, 1.459768689], [1.0, -0.200319404, -1.709593368, -1.011965009], [2.0, -2.433554966, -1.558093989, 0.44943261]], [[0.0, 0.02353679, 0.280785593, -0.238524231], [1.0, 0.203162019, -1.502525578, 1.40292514], [2.0, -0.037211304, -0.319558867, -1.322124136]], [[0.0, -1.45331218, -0.20914855, 0.276561536], [1.0, -0.304656195, -1.594559675, 0.264025112], [2.0, 0.030118014, -0.712998507, -0.044159056]]]]
当前输出(已更新)。所用代码如下:
import pandas as pd
import numpy as np
import json

# Column layout of the raw CSV: two grouping keys (Case, Run) followed by the
# per-sample columns (Timestep, Value0, Value1, ValueN).
COL_NAMES = ['Case', 'Run', 'Timestep', 'Value0', 'Value1', 'ValueN']
case = COL_NAMES[0]
run = COL_NAMES[1]
timestep = COL_NAMES[2]

filepath = 'sample.csv'
# pd.read_csv accepts a path directly; no explicit open() needed.
df = pd.read_csv(filepath, header=0)

# Build the Case -> Run -> samples nesting by APPENDING one sub-list per group.
# The original indexed nested_list by the group key (nested_list[cases][runs]),
# which silently breaks when keys are not contiguous ints starting at 0, and
# raises TypeError under newer pandas where groupby([case]) yields tuple keys.
nested_list = []
for _case_key, case_group in df.groupby(case)[COL_NAMES[1:]]:
    runs_list = []
    for _run_key, run_group in case_group.groupby(run)[COL_NAMES[2:]]:
        # Each run becomes a list of [Timestep, Value0, Value1, ValueN] rows.
        runs_list.append(run_group.to_numpy().tolist())
    nested_list.append(runs_list)

data = nested_list
with open("sample.json", "w") as output:
    json.dump(data, output)
输出:
[[[[0.0, 2.626105446, -0.340988725, 1.459768689], [1.0, -0.200319404, -1.709593368, -1.011965009], [2.0, -2.433554966, -1.558093989, 0.44943261]], [[0.0, 0.02353679, 0.280785593, -0.238524231], [1.0, 0.203162019, -1.502525578, 1.40292514], [2.0, -0.037211304, -0.319558867, -1.322124136]], [[0.0, -1.45331218, -0.20914855, 0.276561536], [1.0, -0.304656195, -1.594559675, 0.264025112], [2.0, 0.030118014, -0.712998507, -0.044159056]]], [[[0.0, 0.237803452, 0.153455073, -1.084628161], [1.0, 0.814585507, -0.825123856, 0.165566792], [2.0, 0.362165592, -0.12731861, -0.572814142]], [[0.0, -0.49709788, -0.499310228, 0.082185424], [1.0, 0.326764013, 0.914642226, -0.96102063], [2.0, -0.43253699, 0.280958492, 0.098690723]], [[0.0, -0.854096958, 0.559531068, 0.547050394], [1.0, 0.331055857, 1.021091422, 0.032419058], [2.0, 0.712204394, -2.050207709, -0.507117825]]], [[[0.0, -0.545231265, 1.112460127, 0.156334193], [1.0, -1.507139756, 1.296971255, -1.247736656], [2.0, -0.661400305, 0.50431061, 1.24670996]], [[0.0, -0.215960487, -0.788252945, 0.481096264], [1.0, -1.233657743, 0.315172941, 0.862535335], [2.0, 0.371303046, 0.770596557, 2.017713093]], [[0.0, 0.282822836, -0.332043452, 1.107596241], [1.0, 0.085487262, 1.104947912, 0.545943326], [2.0, 0.864801017, 1.082528278, 0.386386756]]]]
使用双重 groupby 的写法:
# Same result via nested groupby/apply: for every Case, collect each Run's
# rows (all non-key columns) as nested lists, then strip the index away.
columns = df.columns.drop(['Case', 'Run'])
data = (
    df.groupby(['Case'])
      .apply(lambda case_df: case_df.groupby('Run')[columns].apply(pd.Series.tolist))
      .to_numpy()
      .tolist()
)
在递归中应用 groupby:如果用 `groupby(level=0)` 对数据进行分组,并对删除了第 0 级索引(`droplevel(0)`)的每个组递归地应用相同的逻辑,会怎么样?在最后一步,我们只需应用 `pd.Series.tolist`:
# Alias: pd.Series.tolist also turns a DataFrame's values into nested lists.
tolist = pd.Series.tolist

def curry(df):
    """Recursively convert each level of df's MultiIndex into one list nesting.

    Groups on the outermost index level; while more than one level remains,
    drop the consumed level and recurse, otherwise emit the group's rows.
    """
    result = []
    for _, gr in df.groupby(level=0):
        if gr.index.nlevels > 1:
            result.append(curry(gr.droplevel(0)))
        else:
            result.append(tolist(gr))
    return result
# Apply the recursion to a two-level (Case, Run) index to get the nesting.
indexed = df.set_index(['Case', 'Run'])
data = curry(indexed)
这就是我们如何重写代码以可视化 3 级索引上的递归:
# Explicit three-level version: one comprehension per index level makes the
# recursion over ['Case', 'Run', 'Timestep'] visible (distinct loop names
# replace the original's deliberate shadowing of g; behavior is unchanged).
g = df.set_index(['Case', 'Run', 'Timestep'])
data = [
    [
        [tolist(ts_group) for _, ts_group in run_group.groupby(level=2)]
        for _, run_group in case_group.groupby(level=1)
    ]
    for _, case_group in g.groupby(level=0)
]
为了解决原来的问题,我们将停留在 2 个级别:
# Two-level answer to the original question: group by Case, then by Run,
# and emit each run's rows ([Timestep, Value0, Value1, ValueN]) as lists.
g = df.set_index(['Case', 'Run'])
data = [
    [tolist(run_group) for _, run_group in case_group.groupby(level=1)]
    for _, case_group in g.groupby(level=0)
]