我遇到了一个有趣的难题:我需要按参与者 ID 统计每个列表出现的频率。数据大致如下:
test_table_orig = pa.table([
pa.array(["a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "c", "c", "c", "c", "c", "d", "d", "d", "d", "e", "e", "e", "e", "e", "f", "f", "f", "f", "f", "f"]),
pa.array([[1,1,1,1], [2,0,1,2], [3,2,1,0], [4,3,2,1], [4,3,2,1], [1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4], [5,4,3,2], [5,4,3,2], [5,4,3,2], [5,4,3,2], [4,3,2,1], [6,5,4,3], [6,5,4,3], [8,7,6,5], [9,8,7,6], [7,6,5,4], [7,6,5,4], [7,6,5,4], [7,6,5,4], [10,11,12,13], [11,12,13,14], [12,13,14,15], [33,44,55,66], [22,33,44,55], [55,66,77,88], [22,33,44,55]])
], names=["ID", "ListData"])
不幸的是,聚合函数(例如,
test_table_orig.group_by(['ID','ListData']).aggregate([('ListData','count')])
)
>>> test_table_orig.group_by(['ID','ListData']).aggregate([('ListData','count')]).to_pandas()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "pyarrow/table.pxi", line 5498, in pyarrow.lib.TableGroupBy.aggregate
File "/home/usr/Python/mambaforge/lib/python3.10/site-packages/pyarrow/acero.py", line 308, in _group_by
return decl.to_table(use_threads=use_threads)
File "pyarrow/_acero.pyx", line 511, in pyarrow._acero.Declaration.to_table
File "pyarrow/error.pxi", line 154, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowNotImplementedError: Keys of type list<item: int64>
似乎不适用于列表数据类型。
但是,如果数据是字符串:
test_table_string = pa.table([
pa.array(["a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "c", "c", "c", "c", "c", "d", "d", "d", "d", "e", "e", "e", "e", "e", "f", "f", "f", "f", "f", "f"]),
pa.array(["[1,1,1,1]", "[2,0,1,2]", "[3,2,1,0]", "[4,3,2,1]", "[4,3,2,1]", "[1,2,3,4]", "[1,2,3,4]", "[1,2,3,4]", "[1,2,3,4]", "[1,2,3,4]", "[5,4,3,2]", "[5,4,3,2]", "[5,4,3,2]", "[5,4,3,2]", "[4,3,2,1]", "[6,5,4,3]", "[6,5,4,3]", "[8,7,6,5]", "[9,8,7,6]", "[7,6,5,4]", "[7,6,5,4]", "[7,6,5,4]", "[7,6,5,4]", "[10,11,12,13]", "[11,12,13,14]", "[12,13,14,15]", "[33,44,55,66]", "[22,33,44,55]", "[55,66,77,88]", "[22,33,44,55]"])
], names=["ID", "ListData"])
我能够得到我真正需要的东西:
test_table_string.group_by(['ID','ListData']).aggregate([('ListData','count')]).to_pandas()
ID ListData ListData_count
0 a [1,1,1,1] 1
1 a [2,0,1,2] 1
2 a [3,2,1,0] 1
3 a [4,3,2,1] 2
4 b [1,2,3,4] 5
5 c [5,4,3,2] 4
6 c [4,3,2,1] 1
7 d [6,5,4,3] 2
8 d [8,7,6,5] 1
9 d [9,8,7,6] 1
10 e [7,6,5,4] 4
11 e [10,11,12,13] 1
12 f [11,12,13,14] 1
13 f [12,13,14,15] 1
14 f [33,44,55,66] 1
15 f [22,33,44,55] 2
16 f [55,66,77,88] 1
在实际数据中,每个列表都包含 120 多个值,每个数字最多约 12 个字符(每个列表总共约 2.4k 个字符),所以我不确定先转换成字符串再分组的做法是否可行。
有人知道如何对列表数据类型的列得到这样的汇总结果吗?
或者,有没有好的方法(如果可行的话)把这种列表类型的数据转换为字符串?
如有任何建议,我将不胜感激。
这看起来与“应用操作/功能...”这个问题相同。对于长度固定的列表列,可以把每个位置上的元素展开(透视)成单独的一列,然后按所有列进行分组。
import pyarrow as pa
import pyarrow.compute as pc

# Number of elements in every list of 'ListData'. All lists must have exactly
# this many elements; set it to the real list length for your data.
list_size = 4

# Spread the list column into one scalar column per position (c0, c1, ...),
# because group_by rejects list-typed keys (ArrowNotImplementedError).
columns = {f'c{i}': pc.list_element(test_table_orig['ListData'], i) for i in range(list_size)}
flat_table = pa.table({'ID': test_table_orig['ID']} | columns)

# Group by ID plus every element column and count the rows in each group;
# the count lands in the 'ID_count' column.
flat_table.group_by(['ID', *columns]).aggregate([('ID', 'count')])