我有一段对某些数据运行 KMeans 算法的代码,现在需要用它来计算 Dunn 指数和惯性(inertia)。但是该程序只允许使用 numpy、matplotlib 和 csv 这几个库,而网上没有视频演示如何仅用这几个库计算 Dunn 指数。我不太喜欢数学,所以把实际的数学公式转化成代码对我来说太难了……
我在网上搜索过如何在只能使用 numpy 的限制下用 Python 计算 Dunn 指数和惯性,但找到的方法都用到了其他库。
这是代码:
import numpy as np
import matplotlib.pyplot as plt
import csv
def load_data(file_path):
    """Load the first two columns of a CSV file as a float array.

    The first row is treated as a header and skipped. Completely blank
    rows are ignored.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rows, 2). The shape is (0, 2) when the file
        contains no data rows, so callers can always index two columns.

    Raises
    ------
    ValueError
        If a value in the first two columns cannot be parsed as a float.
    IndexError
        If a non-blank row has fewer than two columns.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation recommends for files passed to csv.reader.
    with open(file_path, 'r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising.
        next(csvreader, None)
        rows = [
            [float(row[0]), float(row[1])]
            for row in csvreader
            if row  # csv.reader yields [] for blank lines
        ]
    return np.array(rows, dtype=float).reshape(-1, 2)
def calculate_distances(data, centers):
    """Return ``data`` with one Euclidean-distance column appended per center.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one point per row.
    centers : numpy.ndarray
        2-D array with one center per row, same number of columns as ``data``.

    Returns
    -------
    numpy.ndarray
        A new array: the columns of ``data`` followed by the distance of
        every point to center 0, center 1, ... in that order. ``data``
        itself is not modified.
    """
    per_center = [
        np.sqrt(np.square(data - centers[idx]).sum(axis=1))
        for idx in range(centers.shape[0])
    ]
    if not per_center:
        # No centers: hand back an untouched copy of the input.
        return data.copy()
    return np.column_stack([data, *per_center])
def get_clusters(data_with_distances):
    """Split the rows into clusters by nearest center.

    Columns 0 and 1 are treated as the point coordinates; every further
    column is the distance to one center, so the number of clusters is
    ``data_with_distances.shape[1] - 2``.

    Each row is assigned to exactly one cluster. When a point is equally
    close to several centers, the lowest-numbered one wins, so a point is
    never duplicated across clusters. NaN distances (e.g. produced by a
    NaN center) are treated as infinitely far away, so a NaN column cannot
    attract points or empty the other clusters.

    Parameters
    ----------
    data_with_distances : numpy.ndarray
        2-D array laid out as described above.

    Returns
    -------
    list of numpy.ndarray
        One array per cluster containing the full rows (coordinates and
        distance columns) of its members. A cluster may be empty.
    """
    num_clusters = data_with_distances.shape[1] - 2
    if num_clusters <= 0:
        return []
    distance_block = data_with_distances[:, 2:num_clusters + 2]
    # NaN would otherwise be picked by argmin as the "minimum".
    distance_block = np.where(np.isnan(distance_block), np.inf, distance_block)
    # argmin returns the first minimum, which resolves ties deterministically.
    nearest = np.argmin(distance_block, axis=1)
    return [data_with_distances[nearest == k, :] for k in range(num_clusters)]
def calculate_centers(clusters):
    """Return the centroid of each cluster.

    Parameters
    ----------
    clusters : list of numpy.ndarray
        One 2-D array per cluster, one point per row.

    Returns
    -------
    numpy.ndarray
        One row per cluster holding the column means of that cluster,
        truncated to the first two columns.
    """
    centroids = []
    for members in clusters:
        column_means = members.mean(axis=0)
        # Only the first two columns are kept as the center.
        centroids.append(column_means[:2])
    return np.array(centroids)
def plot_clusters(clusters, centers):
    """Draw every cluster in its own colour and mark the centers.

    Parameters
    ----------
    clusters : list of numpy.ndarray
        One 2-D array per cluster; columns 0 and 1 are plotted as x and y.
    centers : iterable
        Cluster centers; elements 0 and 1 of each are plotted as x and y.

    Notes
    -----
    The palette is reused cyclically, so any number of clusters can be
    drawn. Purple is left out of the palette because it marks the centers.
    """
    # Blue/red/green stay first so 2- and 3-cluster plots look as before.
    palette = ['blue', 'red', 'green', 'orange', 'cyan',
               'magenta', 'brown', 'olive', 'gray', 'pink']
    for i, cluster in enumerate(clusters):
        plt.scatter(cluster[:, 0], cluster[:, 1],
                    color=palette[i % len(palette)])
    for center in centers:
        plt.scatter(center[0], center[1], color='purple', marker='*', s=150)
    plt.xlabel('Household Total Assets')
    plt.ylabel('Annual Household Income')
    plt.title('K-means Clustering of Household Data')
    plt.show()
def run(data, num_clusters, max_iterations=100):
    """Run k-means on ``data``, plot the result and return it.

    Initial centers are ``num_clusters`` rows drawn from a random
    permutation of ``data``. The assign/update loop stops as soon as the
    centers no longer change at all; from that point every further
    iteration would reproduce the same clusters and centers, so the result
    matches running all ``max_iterations`` passes.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one point per row.
    num_clusters : int
        Number of clusters to form.
    max_iterations : int, optional
        Upper bound on the number of assign/update passes.

    Returns
    -------
    tuple
        ``(clusters, centers)`` as produced by the last pass, so callers
        can compute quality measures such as inertia or the Dunn index.

    Raises
    ------
    ValueError
        If ``max_iterations`` is smaller than 1.
    """
    if max_iterations < 1:
        # Without at least one pass there would be no clusters to plot.
        raise ValueError("max_iterations must be at least 1")
    current_centers = np.random.permutation(data)[:num_clusters]
    for _ in range(max_iterations):
        data_with_distances = calculate_distances(data, current_centers)
        clusters = get_clusters(data_with_distances)
        new_centers = calculate_centers(clusters)
        # Exact comparison: only stop at a true fixed point.
        converged = np.array_equal(new_centers, current_centers)
        current_centers = new_centers
        if converged:
            break
    plot_clusters(clusters, current_centers)
    return clusters, current_centers
def main(file_path):
    """Load the CSV at ``file_path`` and cluster it for k = 2 through 10.

    ``run`` is called once for each cluster count, in increasing order.
    """
    points = load_data(file_path)
    smallest_k, largest_k = 2, 10
    for k in range(smallest_k, largest_k + 1):
        run(points, k)
# Script entry: cluster the points stored in this CSV file.
file_path = 'assessment2dmv.csv'
# NOTE(review): this value is not passed to main(); main() iterates over
# range(2, 11) itself, so this assignment does not influence the run.
num_clusters = 3
main(file_path)
对于 Dunn 指数,您可以使用这个 GitHub 仓库,其中可以找到以下函数(注意:它调用的 `delta` 和 `big_delta` 并未在下面给出):
def _pairwise_distances(a, b):
    """Return the matrix of Euclidean distances between rows of a and b."""
    diff = a[:, np.newaxis, :] - b[np.newaxis, :, :]
    return np.sqrt((diff ** 2).sum(axis=2))


def dunn(k_list):
    """Dunn index [CVI], computed with numpy only.

    The index is the smallest distance between two points of different
    clusters divided by the largest distance between two points of the
    same cluster. Larger values mean compact, well-separated clusters.

    Parameters
    ----------
    k_list : list of np.arrays
        A list containing a numpy array for each cluster |c| = number of clusters
        c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
        Every column is used as a coordinate, so pass coordinates only
        (for rows that carry extra distance columns, slice them first,
        e.g. ``[c[:, :2] for c in clusters]``). Empty clusters are ignored.

    Returns
    -------
    float
        The Dunn index. ``inf`` when every cluster has zero diameter but
        the clusters are separated, ``nan`` when both the smallest
        separation and the largest diameter are zero.

    Raises
    ------
    ValueError
        If fewer than two non-empty clusters are given; the index is
        undefined in that case.
    """
    clusters = [np.asarray(c, dtype=float) for c in k_list]
    clusters = [c.reshape(len(c), -1) for c in clusters if len(c) > 0]
    if len(clusters) < 2:
        raise ValueError("Dunn index needs at least two non-empty clusters")

    # Largest intra-cluster distance (cluster diameter).
    max_diameter = max(_pairwise_distances(c, c).max() for c in clusters)
    # Smallest inter-cluster distance; distances are symmetric, so each
    # unordered pair of clusters is visited once. No artificial sentinel
    # value is involved, so arbitrarily large separations are handled.
    min_separation = min(
        _pairwise_distances(clusters[i], clusters[j]).min()
        for i in range(len(clusters))
        for j in range(i + 1, len(clusters))
    )

    if max_diameter == 0:
        return float('inf') if min_separation > 0 else float('nan')
    return float(min_separation / max_diameter)
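关于惯性,您需要计算下式(其中 $x$ 是属于簇 $k$ 的样本向量,$\mu_k$ 是簇 $k$ 的中心,$C_k$ 是簇 $k$ 中所有样本的集合):
$$J = \sum_{k=1}^{K} \sum_{x \in C_k} \lVert x - \mu_k \rVert^2$$
代码看起来像这样: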
def get_inertia(clusters, centers):
    """Return the k-means inertia (within-cluster sum of squared distances).

    Each cluster is paired with the center at the same position, and the
    squared Euclidean distances of its points to that center are summed
    over all clusters.

    Parameters
    ----------
    clusters : list of numpy.ndarray
        One 2-D array per cluster, one point per row. Only the first
        ``len(center)`` columns of each row are used as coordinates, so
        rows that carry extra trailing columns (such as appended
        distances) are accepted.
    centers : sequence of array-like
        One center per cluster, in the same order as ``clusters``.

    Returns
    -------
    float
        The inertia. Empty clusters contribute nothing.
    """
    inertia = 0.0
    for cluster, center in zip(clusters, centers):
        points = np.asarray(cluster, dtype=float)
        if points.size == 0:
            continue
        center = np.asarray(center, dtype=float)
        coords = points[:, :center.shape[0]]
        inertia += float(((coords - center) ** 2).sum())
    return inertia