python实现K-means算法——以鸢尾花数据集为例
import matplotlib.pyplot as plt from random import sample import numpy as np from sklearn.cluster import KMeans #from sklearn import datasets from sklearn.datasets import load_iris #导入鸢尾花数据集 #以二维数据为例 假设k=2,X为鸢尾花数据集前两维 iris = load_iris() X = iris.data[:,0:2] ##表示我们只取特征空间中的前两个维度 X类型是np.array print(len(X)) #绘制数据分布图 显示前两维数据 plt.scatter(X[:, 0], X[:, 1], c = "red", marker=o, label=see) plt.xlabel(petal length) plt.ylabel(petal width) plt.legend(loc=2) plt.show() #从X中随机选择k个样本作为初始“簇中心”向量: μ(1),μ(2),...,,μ(k) #随机获得两个数据 n = 3 #表示n个聚类 u = sample(X.tolist(),n) #随机选择n个X中的向量作为聚类中心 max_iter = 0 #记录迭代次数 u_before = u while max_iter<5: #簇分配过程 c = [] print(u_before,u) for i in range(len(X)): min = 1000 index = 0 for j in range(n): vec1 = X[i] vec2 = u[j] dist = np.sqrt(np.sum(np.square(vec1 - vec2))) if dist<min: min = dist index =j c.append(index) # print(i,"------",c[i]) #移动聚类中心 for j in range(n): sum = np.zeros(2) # 定义n为零向量 此处n为2 count = 0 # 统计不同类别的个数 for i in range(len(X)): if c[i]==j: sum = sum+X[i] count = count+1 u[j] = sum/count print(max_iter,"------------",u) #设置迭代次数 max_iter = max_iter + 1 print(np.array(c)) label_pred = np.array(c) #绘制k-means结果 x0 = X[label_pred == 0] x1 = X[label_pred == 1] x2 = X[label_pred == 2] plt.scatter(x0[:, 0], x0[:, 1], c = "red", marker=o, label=label0) plt.scatter(x1[:, 0], x1[:, 1], c = "green", marker=*, label=label1) plt.scatter(x2[:, 0], x2[:, 1], c = "blue", marker=+, label=label2) plt.xlabel(petal length) plt.ylabel(petal width) plt.legend(loc=2) plt.show()
最后结果如下图所示: