矩阵
# Example input: a 3x4 integer matrix of pairwise scores.
sim = np.array([[1,5,6,2],[7,10,3,4],[1,5,3,7]])
# NOTE(review): pairwise_best_hit is not defined in this view; the line after
# the call appears to be its expected result (two parallel index lists).
pairwise_best_hit(sim)
[[1, 2, 0], [1, 3, 2]]
下面是一个仅使用 numpy 的候选方案:
import numpy as np
import pandas as pd
import timeit
# Sample 3x4 integer score matrix used by the equality check and timings below.
sim = np.array([[1,5,6,2],[7,10,3,4],[1,5,3,7]])
def mine(sim):
    """Greedily pair rows with columns by descending score, using NumPy only.

    Repeatedly takes the largest remaining entry of ``sim``, records its
    ``(row, column)`` position, then retires that entire row and column so
    neither index can be picked again.  Ties go to the first maximum in
    row-major order, which is what ``argmax`` returns.

    Parameters
    ----------
    sim : array_like
        2-D score matrix with an integer or floating-point dtype.  The input
        is never modified; the function works on a private copy.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(2, min(sim.shape))``.  The first row holds
        the chosen row indices and the second row the matching column
        indices, in the order they were picked.

    Notes
    -----
    Retired cells are overwritten with a sentinel (the dtype minimum for
    integers, ``-inf`` for floats), so genuine entries equal to that sentinel
    cannot be told apart from retired ones.  Dtypes that are neither integer
    nor floating are rejected by ``np.iinfo`` with ``ValueError``.
    """
    work = np.array(sim)  # always a fresh copy; also accepts nested lists
    if np.issubdtype(work.dtype, np.floating):
        # np.iinfo only understands integer dtypes; -inf is the float analogue.
        sentinel = -np.inf
    else:
        sentinel = np.iinfo(work.dtype).min

    out = []
    for _ in range(min(work.shape)):
        ij = np.unravel_index(work.argmax(), work.shape)
        out.append(ij)
        # Retire the chosen row and column so they can never win again.
        work[ij[0]] = sentinel
        work[:, ij[1]] = sentinel

    # Explicit dtype/shape keeps the result a (2, k) integer array even when
    # k == 0, where a bare transpose of [] would give an empty float vector.
    return np.array(out, dtype=np.intp).reshape(len(out), 2).T
def yours(sim):
    """Greedily pair rows with columns by descending score, using pandas.

    Flattens ``sim`` into a table with one record per cell, sorts the records
    by score (highest first), then repeatedly takes the top record and drops
    every remaining record that shares its row or its column.

    Parameters
    ----------
    sim : array_like
        2-D score matrix.

    Returns
    -------
    list
        ``[seq1_hits, seq2_hits]``: two parallel lists holding the chosen row
        indices and the matching column indices, in pick order.
    """
    sim = np.asarray(sim)
    row_idx, col_idx = np.indices(sim.shape)
    # Indices live in their own integer columns; stacking them into a single
    # array with the scores would upcast them to float for float matrices.
    df = pd.DataFrame({
        'sim': sim.ravel(),
        'index1': row_idx.ravel(),
        'index2': col_idx.ravel(),
    }).sort_values('sim', ascending=False, kind='stable')
    # kind='stable' keeps equal scores in row-major order, so ties are broken
    # deterministically (first occurrence wins).

    seq1_hits = []
    seq2_hits = []
    while not df.empty:
        # Read from the individual columns: taking the whole first row would
        # coerce the integer indices to the common (float) dtype.
        index1 = df['index1'].iat[0]
        index2 = df['index2'].iat[0]
        seq1_hits.append(index1)
        seq2_hits.append(index2)
        # Discard every candidate that reuses the chosen row or column.
        df = df[(df['index1'] != index1) & (df['index2'] != index2)]
    return [seq1_hits, seq2_hits]
# Sanity check: both implementations must agree on the sample matrix.
assert np.all(mine(sim) == yours(sim))
# The timings below use the IPython/Jupyter ``%timeit`` magic, so these lines
# only run in such a session; the trailing comments are the recorded output.
%timeit yours(sim)
# 1.05 ms ± 6.78 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%timeit mine(sim)
# 8.18 µs ± 19.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)