我有一棵树,其中包含多个节点之间的链接。我还有一个包含 100 万行的数据框。树节点和数据帧列之间存在如下映射:
import networkx as nx
import pandas as pd
import numpy as np
g = nx.DiGraph()
g.add_edge('a', 'b')
g.add_edge('a', 'c')
g.add_edge('b', 'd')
g.add_edge('b', 'e')
g.add_edge('c', 'f')
g.add_edge('c', 'g')
nx.draw(g, with_labels=True)
df = pd.DataFrame({
"l1": "a",
"l2": ['b', 'c', 'b', 'c'],
'l3': ['d', 'f', 'e', 'g'],
'data': [1, 2, 3, 4]
})
id_to_col_map = {'a': 'l1', 'b': 'l2', 'c': 'l2', 'd': 'l3', 'e': 'l3', 'f': 'l3', 'g': 'l3'}
root_node_id = 'a'
现在,我需要找出数据框中每一行所采取的路径。这是我的解决方案:
leaf_cols = []
def helper(node_id, sub_df, parent_truthy, path=None):
path = path or []
new_col_name = f"matches_{','.join([*path, node_id])}"
col_name = id_to_col_map[node_id]
sub_df[new_col_name] = parent_truthy & (sub_df[col_name] == node_id)
successors = list(g.successors(node_id))
if len(successors) == 0:
leaf_cols.append(new_col_name)
else:
for successor in successors:
helper(successor, sub_df, sub_df[new_col_name], path=[*path, node_id])
truthy = np.ones(len(df), dtype=bool)
helper(root_node_id, df, truthy)
df["path"] = ""
for leaf_col in leaf_cols:
cond = (df["path"].str.len() == 0) & df[leaf_col]
df["path"] = np.where(
cond,
leaf_col.replace("matches_", ""),
df["path"]
)
此解决方案的工作原理如下:
True
。这帮助我确定
每行所走的路径。这个算法真的很慢。请帮我优化一下。
使用 BFS 而不是 DFS - 假设 g、df、id_to_col_map 和 root_node_id 的定义如示例中所示
import networkx as nx
import pandas as pd
import numpy as np
def optimize_tree_traversal(g, df, id_to_col_map, root_node_id):
# Step 1: Create a dictionary to store paths
paths = {root_node_id: [root_node_id]}
# Step 2: Perform BFS to get all paths
for node in nx.bfs_tree(g, root_node_id):
for successor in g.successors(node):
paths[successor] = paths[node] + [successor]
# Step 3: Create conditions for each path
conditions = {}
for leaf, path in paths.items():
if not list(g.successors(leaf)): # Check if it's a leaf node
condition = np.ones(len(df), dtype=bool)
for node, next_node in zip(path, path[1:] + [None]):
col = id_to_col_map[node]
condition &= (df[col] == node)
if next_node:
next_col = id_to_col_map[next_node]
condition &= (df[next_col] == next_node)
conditions[leaf] = condition
# Step 4: Assign paths to rows
df['path'] = ''
for leaf, condition in conditions.items():
df.loc[condition & (df['path'] == ''), 'path'] = ','.join(paths[leaf])
return df
df_result = optimize_tree_traversal(g, df, id_to_col_map, root_node_id)
性能对比
DFS:
Execution Time: 89 seconds
BFS:
Execution Time: 3.76 seconds