我有一个 DataFrame,其中有一列出版商名称,该列中同一出版商的名称存在多种细微的写法差异。例如,“Harlequin Romance”、“Harlequin Blaze”、“Harlequin Superromance”和“Harlequin”等条目均指同一出版商“Harlequin”。同样,“Hackett Publishing Company Inc.”、“Hackett Publ. Co Inc.”和“Hackett Publishing Company Inc. (USA)”应标准化为单一名称,例如“Hackett Publishing Company Inc.”。
考虑到 DataFrame 的规模(超过 10,000 行)以及不同出版商的数量(超过 100 个),手动创建映射字典并不现实。我需要一种动态的方法,能够自动识别这些相似的出版商名称并将其标准化。
我想尝试解决这个问题,但不知道该从何入手。以下是我目前写的代码:
import pandas as pd
from rapidfuzz import fuzz
import hdbscan
# Sample DataFrame
# Seven raw publisher strings: four "Harlequin ..." spellings followed by
# three "Hackett ..." spellings, held in a single 'publisher' column.
data = dict(
    publisher=[
        'Harlequin Romance',
        'Harlequin Blaze',
        'Harlequin Superromance',
        'Harlequin',
        'Hackett Publishing Company Inc.',
        'Hackett Publ. Co Inc',
        'Hackett Publishing Company Inc. (USA)',
    ],
)
df = pd.DataFrame(data)
# Step 1: Compute the similarity matrix using RapidFuzz
def compute_similarity_matrix(names, scorer=None):
    """Return the symmetric pairwise similarity matrix for *names*.

    Args:
        names: Sequence of strings to compare; it must support ``len`` and
            integer indexing.
        scorer: Optional callable ``scorer(a, b)`` giving the similarity of
            two names. When omitted, RapidFuzz's ``fuzz.ratio`` is used.

    Returns:
        A list of ``n`` lists of length ``n`` in which entry ``[i][j]`` is the
        score of ``names[i]`` against ``names[j]``. Empty input yields ``[]``.
    """
    if scorer is None:
        scorer = fuzz.ratio

    n = len(names)
    similarity_matrix = [[0] * n for _ in range(n)]
    for i in range(n):
        # Score only the upper triangle (diagonal included) and mirror each
        # result; this assumes scorer(a, b) == scorer(b, a).
        for j in range(i, n):
            similarity = scorer(names[i], names[j])
            similarity_matrix[i][j] = similarity
            similarity_matrix[j][i] = similarity
    return similarity_matrix
# Cluster each distinct, non-null name once: the pairwise matrix grows
# quadratically with the number of names, and the mapping built in Step 4 is
# keyed by name, so repeated rows add cost without adding information.
publishers = df['publisher'].dropna().unique().tolist()
similarity_matrix = compute_similarity_matrix(publishers)

# Step 2: Convert similarity matrix to a distance matrix (1 - similarity)
# The scores are divided by 100 to rescale them before taking the complement.
distance_matrix = 1 - (pd.DataFrame(similarity_matrix) / 100)

# Step 3: Cluster similar names using HDBSCAN
clusterer = hdbscan.HDBSCAN(metric='precomputed', min_cluster_size=2)
# Hand the clusterer a plain float64 ndarray rather than a DataFrame.
cluster_labels = clusterer.fit_predict(distance_matrix.to_numpy(dtype='float64'))

# Step 4: Create a mapping of publishers to their cluster exemplar
# Group names by label in a single pass; label -1 marks noise points, which
# belong to no cluster and are therefore left out of the mapping.
members_by_label = {}
for name, label in zip(publishers, cluster_labels):
    if label != -1:
        members_by_label.setdefault(label, []).append(name)

clusters = {}
for cluster_members in members_by_label.values():
    # The exemplar is the member with the highest total similarity to the
    # members of its own cluster.
    exemplar = max(
        cluster_members,
        key=lambda name: sum(fuzz.ratio(name, other) for other in cluster_members),
    )
    for member in cluster_members:
        clusters[member] = exemplar

# Step 5: Apply the mapping to standardize publisher names
# Names absent from the mapping (noise points) would come back as NaN from
# map(); keep their original spelling instead.
df['standardized_publisher'] = df['publisher'].map(clusters).fillna(df['publisher'])
print(df)