一枚NLPer小菜鸡

kmeans numpy实现

k-means算法

k-means作为基本的无监督机器学习算法,在面试中经常会被要求手动实现。算法本身其实不难,但在面试场景下实现得准确且简洁十分重要。因此,本篇博文实现了k-means基本算法,希望大家都可以自己动手实现一遍,以保证在需要手写的时候能够快速完成。关于k-means算法的基本原理,相信大家都已经十分清楚,因此在这里不多展开介绍,不懂的同学请自行百度或者Google。

具体实现代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import numpy as np
import random
import matplotlib.pyplot as plt

# Distance metric: Euclidean distance is used as the example.
def eulDistance(vector1, vevtor2):
    """Return the Euclidean distance between two vectors.

    Args:
        vector1: First vector (list, ``np.ndarray`` or ``np.matrix``).
        vevtor2: Second vector, broadcast-compatible with ``vector1``.
            The misspelled parameter name is kept on purpose so that
            existing keyword callers keep working.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    # np.asarray lets plain Python sequences work and makes the arithmetic
    # element-wise even when an np.matrix is passed in.
    diff = np.asarray(vector1) - np.asarray(vevtor2)
    return np.sqrt(np.sum(np.square(diff)))


# Randomly initialise the cluster centres.
def initCentroids(dataset, k):
    """Pick ``k`` rows of ``dataset`` at random as the initial centroids.

    Args:
        dataset: 2-D ``np.ndarray`` or ``np.matrix``; its ``shape`` is
            unpacked as ``(numSamples, dim)``.
        k: Number of centroids to draw.

    Returns:
        A float ``np.ndarray`` of shape ``(k, dim)`` whose rows are copies of
        the selected samples.
    """
    numSamples, dim = dataset.shape

    if k <= numSamples:
        # Distinct row indices: drawing the same row twice would create two
        # identical centroids and therefore an empty cluster later on.
        indices = random.sample(range(numSamples), k)
    else:
        # More centroids than samples: duplicates are unavoidable, so fall
        # back to drawing with replacement. randrange never returns the
        # upper bound, unlike int(random.uniform(0, n)) which can round to n.
        indices = [random.randrange(numSamples) for _ in range(k)]

    centerid = np.zeros((k, dim))
    for i, index in enumerate(indices):
        centerid[i, :] = dataset[index, :]
    return centerid

# The k-means implementation itself.
def kmeans(dataset, k):
    """Cluster the rows of ``dataset`` into ``k`` groups with k-means.

    Alternates between assigning every sample to its nearest centroid and
    moving each centroid to the mean of its members, until a full pass leaves
    every assignment unchanged.

    Args:
        dataset: 2-D ``np.ndarray`` or ``np.matrix`` with one sample per row.
        k: Number of clusters.

    Returns:
        A tuple ``(centroids, cluster)``:
        ``centroids`` is the ``(k, dim)`` array returned by ``initCentroids``
        after refinement; ``cluster`` is an ``np.matrix`` with one row per
        sample, column 0 holding the assigned cluster index and column 1 the
        squared distance to that cluster's centroid at assignment time.
    """
    numsamples = dataset.shape[0]

    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    cluster = np.asmatrix(np.zeros((numsamples, 2)))
    # Start from an impossible label so that a first-pass assignment to
    # cluster 0 is still recognised as a change.
    cluster[:, 0] = -1

    centroids = initCentroids(dataset, k)

    cluster_changed = True  # did any sample switch cluster in the last pass?
    while cluster_changed:
        cluster_changed = False

        # Re-assign every sample to its nearest centroid.
        for i in range(numsamples):
            min_dis = float('inf')
            min_index = -1
            for cls in range(k):
                dis_tmp = eulDistance(dataset[i, :], centroids[cls, :])
                if dis_tmp < min_dis:
                    min_dis = dis_tmp
                    min_index = cls

            if cluster[i, 0] != min_index:
                cluster_changed = True
            # Always refresh the stored distance: the centroid may have moved
            # even when the label stayed the same.
            cluster[i, :] = min_index, min_dis ** 2

        # Move every centroid to the mean of the samples assigned to it.
        labels = np.asarray(cluster[:, 0]).ravel()
        for cls in range(k):
            members = np.flatnonzero(labels == cls)
            if members.size == 0:
                # The mean of an empty set is NaN and would poison this
                # centroid for good; keep its previous position instead.
                continue
            centroids[cls, :] = np.mean(dataset[members], axis=0)

    # Return the centroids and the per-sample cluster assignment.
    return centroids, cluster

# Plotting.
def plotCluster(dataSet, k, centroids, cluster, output_path="./output.png"):
    """Scatter-plot the clustered samples and their centroids to an image.

    Only the first two columns of ``dataSet`` and ``centroids`` are drawn.

    Args:
        dataSet: 2-D array or matrix of samples, one per row.
        k: Number of centroids to draw.
        centroids: 2-D array of centroid coordinates, one per row.
        cluster: 2-D array or matrix whose column 0 is the cluster index of
            each sample (as returned by ``kmeans``).
        output_path: Where the figure is saved; defaults to the original
            hard-coded ``"./output.png"``.
    """
    color = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    n_styles = len(color)

    points = np.asarray(dataSet)
    centers = np.asarray(centroids)
    labels = np.asarray(cluster)[:, 0].astype(int)

    # One plot call per cluster instead of one per sample. The style index
    # wraps around so more than len(color) clusters no longer raises
    # IndexError.
    for label in np.unique(labels):
        members = points[labels == label]
        plt.plot(members[:, 0], members[:, 1], color[label % n_styles])

    # Centroids reuse their cluster's style, drawn larger.
    for i in range(k):
        plt.plot(centers[i, 0], centers[i, 1], color[i % n_styles], markersize=12)

    plt.savefig(output_path)

if __name__ == "__main__":
    # Load tab-separated 2-D points, one sample per line.
    dataset = []
    # The with-block closes the file even if a line fails to parse.
    with open('./ceshi.txt', 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            lineArr = stripped.split('\t')
            dataset.append([float(lineArr[0]), float(lineArr[1])])

    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    dataset = np.asmatrix(dataset)
    k = 4
    cenid, cluster = kmeans(dataset, k)

    plotCluster(dataset, k, cenid, cluster)

输出如下图所示
avatar

O(∩_∩)O哈哈~