我有一本大字典,其中包含字符串列表作为值。这些字符串可以重复。我想将这本字典转换为键列表的列表。目标是将原始字典的键与唯一字符串相关联,以便稍后我可以在后续分析中轻松获取任何唯一字符串的所有键。
模拟功能如下:
import cProfile
import time
import string
import random
import collections
import numpy as np
from typing import Dict, List, Set
def test_large_dict(num_keys: int = 300001, num_strs: int = 10000):
    """Simulate a large dict of key -> list-of-strings and invert it.

    Builds ``num_keys - 1`` entries (keys 1..num_keys-1; note key 0 is
    intentionally skipped, matching the original benchmark) whose values are
    lists of 10-300 strings drawn from ``num_strs`` random unique strings,
    then inverts it so each unique string maps to the set of keys containing it.

    Args:
        num_keys: upper bound (exclusive) of the integer keys to generate.
        num_strs: number of distinct random strings in the pool.

    Returns:
        (str_index, key_lists): ``str_index`` maps each unique string to an
        index i such that ``key_lists[i]`` is the set of all dict keys whose
        value list contains that string.
    """
    char_pool: str = string.ascii_uppercase
    # simulate a large dictionary with num_keys keys, num_strs unique strings.
    test_dict: Dict[int, List[str]] = collections.defaultdict(list)
    str_nums: np.ndarray = np.random.randint(10, 300, size=num_keys)
    str_len: List[int] = np.random.randint(7, 30, size=num_strs).tolist()
    # random.choices(k=...) draws all characters in one C-level call; the
    # original per-character random.choice dominated the cProfile output
    # (46M calls, ~109s cumulative).
    rand_strs: List[str] = [
        ''.join(random.choices(char_pool, k=str_len[i]))
        for i in range(num_strs)
    ]
    for i in range(1, num_keys):
        # no. of strings in current value: str_nums[i]
        test_dict[i] = random.choices(rand_strs, k=str_nums[i])
    # In original dictionary
    print(f"1: {test_dict[1]}")
    print(f"2: {test_dict[2]}")
    # ===========================================
    # The block I want to optimise: invert dict into per-unique-string key sets.
    t = time.time()
    str_index: Dict[str, int] = {}
    key_lists: List[Set[int]] = []
    k: int = 0
    for ix, str_list in test_dict.items():
        for s in str_list:
            if s in str_index:
                key_lists[str_index[s]].add(ix)
            else:
                # first sighting of s: allocate the next slot in key_lists
                str_index[s] = k
                key_lists.append({ix})
                k += 1
    print(f"Elapsed time for the conversion: {time.time() - t:.2f} seconds.")
    # ===========================================
    return str_index, key_lists
# Profile the whole simulation; sort hot spots by own time (tottime).
if __name__ == '__main__':
    cProfile.run("test_large_dict()", sort="tottime")
测试时间:
Elapsed time for the conversion: 24.77 seconds.
分析结果为:
355043899 function calls in 148.398 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
46487731 37.495 0.000 109.065 0.000 random.py:341(choice)
46487731 35.055 0.000 56.797 0.000 random.py:242(_randbelow_with_getrandbits)
1 27.915 27.915 147.832 147.832 test_proteolysis.py:104(test_large_dict)
92975462 14.773 0.000 14.773 0.000 {built-in method builtins.len}
76097488 13.163 0.000 13.163 0.000 {method 'getrandbits' of '_random.Random' objects}
46297043 10.721 0.000 10.721 0.000 {method 'add' of 'set' objects}
46487731 8.579 0.000 8.579 0.000 {method 'bit_length' of 'int' objects}
1 0.566 0.566 148.398 148.398 <string>:1(<module>)
190688 0.076 0.000 0.476 0.000 test_proteolysis.py:112(<genexpr>)
10000 0.040 0.000 0.517 0.000 {method 'join' of 'str' objects}
2 0.005 0.003 0.005 0.003 {method 'randint' of 'numpy.random.mtrand.RandomState' objects}
10000 0.005 0.000 0.005 0.000 {method 'append' of 'list' objects}
3 0.005 0.002 0.005 0.002 {built-in method builtins.print}
1 0.000 0.000 0.000 0.000 {method 'tolist' of 'numpy.ndarray' objects}
2 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}
1 0.000 0.000 148.398 148.398 {built-in method builtins.exec}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
2 0.000 0.000 0.000 0.000 fromnumeric.py:71(_wrapreduction)
2 0.000 0.000 0.000 0.000 fromnumeric.py:2979(prod)
2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}
2 0.000 0.000 0.000 0.000 {built-in method time.time}
3 0.000 0.000 0.000 0.000 {method 'items' of 'dict' objects}
2 0.000 0.000 0.000 0.000 fromnumeric.py:2974(_prod_dispatcher)
花费了大量时间将这本大字典解压成列表。有没有更有效的方法来解压大字典?我也可以自由地使用任何数据结构来实现目标。在提供的示例中,我生成了
dict
str_index
和 list
key_lists
。str_index 的键是唯一的字符串,值是该字符串对应的列表在 key_lists 中的索引,这样原始大字典中第 i 个唯一字符串的所有键都放入 key_lists 中的第 i 个列表;例如,唯一字符串 sj 的所有键就是 key_lists[str_index[sj]]。
所以如果我没记错的话,用 collections.Counter 会更快。如果迭代次数很多,就必须避免反复调用 set.add。
我有一台旧机器,所以我用较低的num_keys
完成了我的测试
import cProfile
import time
import string
import random
import collections
import numpy as np
from typing import Dict, List, Set
from collections import Counter
def test_large_dict(num_keys: int = 30001, num_strs: int = 10000):
    """Simulate a large dict of key -> list-of-strings and invert it.

    Same contract as the asker's version: build keys 1..num_keys-1 whose
    values are lists of strings drawn from ``num_strs`` random unique
    strings, then invert so each unique string maps to the set of keys
    whose value list contains it.

    Args:
        num_keys: upper bound (exclusive) of the integer keys to generate.
        num_strs: number of distinct random strings in the pool.

    Returns:
        (str_index, key_lists): ``str_index`` maps each unique string to an
        index i such that ``key_lists[i]`` holds all keys containing it.
    """
    char_pool: str = string.ascii_uppercase
    # simulate a large dictionary with num_keys keys, num_strs unique strings.
    test_dict: Dict[int, List[str]] = collections.defaultdict(list)
    str_nums: np.ndarray = np.random.randint(10, 300, size=num_keys)
    str_len: List[int] = np.random.randint(7, 30, size=num_strs).tolist()
    rand_strs: List[str] = [
        ''.join(random.choices(char_pool, k=str_len[i]))
        for i in range(num_strs)
    ]
    for i in range(1, num_keys):
        # no. of strings in current value: str_nums[i]
        test_dict[i] = random.choices(rand_strs, k=str_nums[i])
    # ===========================================
    # The block I want to optimise.
    t = time.time()
    counters = collections.defaultdict(set)
    for ix, str_list in test_dict.items():
        # BUG FIX: the previous `for k, v in Counter(str_list).items(): if v > 1`
        # filter dropped every string occurring exactly once in a value list,
        # producing an incomplete index. Deduplicating with set() keeps one
        # add() per distinct string while recording all of them.
        for s in set(str_list):
            counters[s].add(ix)
    str_index = {s: i for i, s in enumerate(counters)}
    key_lists = list(counters.values())
    print(f"Elapsed time for the conversion: {time.time() - t:.2f} seconds.")
    # ===========================================
    return str_index, key_lists
# Profile the whole simulation; sort hot spots by own time (tottime).
if __name__ == '__main__':
    cProfile.run("test_large_dict()", sort="tottime")
你的代码给了我
Elapsed time for the conversion: 4.13 seconds.
36733678 function calls in 18.621 seconds
还有我的
Elapsed time for the conversion: 1.25 seconds.
10027910 function calls (10027896 primitive calls) in 4.494 seconds