Python:高效解压大型字典,将字符串列表作为键列表列表的值

问题描述 投票:0回答:1

我有一本大字典,其中包含字符串列表作为值。这些字符串可以重复。我想将这本字典转换为键列表的列表。目标是将原始字典的键与唯一字符串相关联,以便稍后我可以在后续分析中轻松获取任何唯一字符串的所有键

模拟功能如下:

import cProfile
import time
import string
import random
import collections
import numpy as np
from typing import Dict, List, Set


def test_large_dict():
    char_pool: str = string.ascii_uppercase
    # simulate a large dictionary with 300,001 keys, 10,000 unique strings.
    test_dict: Dict[int, List[str]] = collections.defaultdict(list)
    num_keys: int = 300001
    str_nums: np.ndarray = np.random.randint(10, 300, size=num_keys)
    str_len: List[int] = np.random.randint(7, 30, size=10000).tolist()
    rand_strs: List[str] = [
        ''.join(random.choice(char_pool) for _ in range(str_len[i]))
        for i in range(10000)
    ]
    for i in range(1, num_keys):
        # no. of strings in current value: str_nums[i])
        test_dict[i] = [random.choice(rand_strs) for _ in range(str_nums[i])]

    # In original dictionary
    print(f"1: {test_dict[1]}")
    print(f"2: {test_dict[2]}")

    # ===========================================
    # The block I want to optimise.
    t = time.time()
    str_index: Dict[str, int] = {}
    key_lists: List[Set[int]] = []
    k: int = 0
    for ix, str_list in test_dict.items():
        for s in str_list:
            if s in str_index:
                g = str_index[s]
                key_lists[g].add(ix)
            else:
                str_index[s] = k
                key_lists.append({ix})
                k += 1
    print(f"Elapsed time for the conversion: {time.time() - t:.2f} seconds.")
    # ===========================================


if __name__ == '__main__':
    cProfile.run("test_large_dict()", sort="tottime")

测试时间:

Elapsed time for the conversion: 24.77 seconds.

分析结果为:

355043899 function calls in 148.398 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
 46487731   37.495    0.000  109.065    0.000 random.py:341(choice)
 46487731   35.055    0.000   56.797    0.000 random.py:242(_randbelow_with_getrandbits)
        1   27.915   27.915  147.832  147.832 test_proteolysis.py:104(test_large_dict)
 92975462   14.773    0.000   14.773    0.000 {built-in method builtins.len}
 76097488   13.163    0.000   13.163    0.000 {method 'getrandbits' of '_random.Random' objects}
 46297043   10.721    0.000   10.721    0.000 {method 'add' of 'set' objects}
 46487731    8.579    0.000    8.579    0.000 {method 'bit_length' of 'int' objects}
        1    0.566    0.566  148.398  148.398 <string>:1(<module>)
   190688    0.076    0.000    0.476    0.000 test_proteolysis.py:112(<genexpr>)
    10000    0.040    0.000    0.517    0.000 {method 'join' of 'str' objects}
        2    0.005    0.003    0.005    0.003 {method 'randint' of 'numpy.random.mtrand.RandomState' objects}
    10000    0.005    0.000    0.005    0.000 {method 'append' of 'list' objects}
        3    0.005    0.002    0.005    0.002 {built-in method builtins.print}
        1    0.000    0.000    0.000    0.000 {method 'tolist' of 'numpy.ndarray' objects}
        2    0.000    0.000    0.000    0.000 {method 'reduce' of 'numpy.ufunc' objects}
        1    0.000    0.000  148.398  148.398 {built-in method builtins.exec}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        2    0.000    0.000    0.000    0.000 fromnumeric.py:71(_wrapreduction)
        2    0.000    0.000    0.000    0.000 fromnumeric.py:2979(prod)
        2    0.000    0.000    0.000    0.000 {built-in method builtins.getattr}
        2    0.000    0.000    0.000    0.000 {built-in method time.time}
        3    0.000    0.000    0.000    0.000 {method 'items' of 'dict' objects}
        2    0.000    0.000    0.000    0.000 fromnumeric.py:2974(_prod_dispatcher)

花费了大量时间将这本大字典解压成列表。有没有更有效的方法来解压大字典?我也可以自由地使用任何数据结构来实现目标。在提供的示例中,我生成了

dict
str_index
list
key_lists
str_index
的键是唯一的字符串,值是
key_lists
中列表的索引,这样原始大字典中第i唯一字符串的所有键都放入key_lists中的第
i
列表中,例如,第 i 个唯一字符串的所有键
sj
key_lists[str_index[sj]]

python
1个回答
0
投票

所以如果我没记错的话,你会用

collections.Counter
走得更快。如果你有很多迭代,你必须避免
set.add
我有一台旧机器,所以我用较低的
num_keys

完成了我的测试
import cProfile
import time
import string
import random
import collections
import numpy as np
from typing import Dict, List, Set
from collections import Counter


def test_large_dict():
    char_pool: str = string.ascii_uppercase
    # simulate a large dictionary with 300,001 keys, 10,000 unique strings.
    test_dict: Dict[int, List[str]] = collections.defaultdict(list)
    num_keys: int = 30001
    str_nums: np.ndarray = np.random.randint(10, 300, size=num_keys)
    str_len: List[int] = np.random.randint(7, 30, size=10000).tolist()
    rand_strs: List[str] = [
        ''.join(random.choices(char_pool, k=str_len[i]))
        for i in range(10000)
    ]
    for i in range(1, num_keys):
        # no. of strings in current value: str_nums[i])
        test_dict[i] = random.choices(rand_strs, k=str_nums[i])

    # In original dictionary
    # ~ print(f"1: {test_dict[1]}")
    # ~ print(f"2: {test_dict[2]}")

    # ===========================================
    # The block I want to optimise.
    t = time.time()
    counters = collections.defaultdict(set)
    for ix, str_list in test_dict.items():
        for k, v in Counter(str_list).items():
            if v > 1:
                counters[k].add(ix)

    str_index = {k: i for i, k in enumerate(counters.keys())}
    key_lists = list(counters.values())
    print(f"Elapsed time for the conversion: {time.time() - t:.2f} seconds.")
    # ===========================================


if __name__ == '__main__':
    cProfile.run("test_large_dict()", sort="tottime")

你的代码给了我

Elapsed time for the conversion: 4.13 seconds.
         36733678 function calls in 18.621 seconds

还有我的

Elapsed time for the conversion: 1.25 seconds.
         10027910 function calls (10027896 primitive calls) in 4.494 seconds
© www.soinside.com 2019 - 2024. All rights reserved.