-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathkmeans.py
112 lines (99 loc) · 4.42 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# -*- coding: utf-8 -*-
import numpy as np
import warnings
class KMeansClassifier(object):
    """K-means clustering using Euclidean distance.

    Parameters
    ----------
    k : int
        Number of clusters.
    initCent : str or array-like
        ``'random'`` draws each initial centroid uniformly inside the
        per-feature bounding box of the training data.  An array-like of
        shape ``(k, n_features)`` is used (copied) as the initial centroids.
    max_iter : int
        Upper bound on the number of assign/update iterations.

    After ``fit`` the following attributes are populated:

    * ``_centroids``      -- ``(k, n_features)`` float array of cluster centres.
    * ``_clusterAssment`` -- ``(m, 2)`` float array; column 0 holds the cluster
      index of each sample, column 1 the squared distance to its centroid.
    * ``_labels``         -- column 0 of ``_clusterAssment`` (float dtype).
    * ``_sse``            -- sum of column 1 of ``_clusterAssment``.
    """

    def __init__(self, k=3, initCent='random', max_iter=500):
        self._k = k
        self._initCent = initCent
        self._max_iter = max_iter
        # Defined up front so that predict() can detect an unfitted model.
        self._centroids = None
        self._clusterAssment = None
        self._labels = None
        self._sse = None

    def _calEDist(self, arrA, arrB):
        """Return the Euclidean distance between two 1-D arrays as a float."""
        diff = np.asarray(arrA) - np.asarray(arrB)
        # np.sqrt instead of np.math.sqrt: the np.math alias no longer exists
        # in current NumPy releases.
        return float(np.sqrt(np.sum(np.power(diff, 2))))

    def _calMDist(self, arrA, arrB):
        """Return the Manhattan (L1) distance between two 1-D arrays."""
        return np.sum(np.abs(np.asarray(arrA) - np.asarray(arrB)))

    def _randCent(self, data_X, k):
        """Draw ``k`` random centroids inside the bounding box of ``data_X``.

        Returns a ``(k, n_features)`` array.  Each coordinate is sampled
        uniformly between the minimum and maximum of the matching column.
        """
        data_X = np.asarray(data_X)
        n = data_X.shape[1]  # number of features
        mins = data_X.min(axis=0)
        ranges = (data_X.max(axis=0) - mins).astype(float)
        # rand(n, k).T consumes the global random stream column by column,
        # i.e. in the same order as drawing k values per feature in a loop.
        return mins + ranges * np.random.rand(n, k).T

    @staticmethod
    def _as_2d_array(data, name):
        """Convert ``data`` to a plain 2-D ``numpy.ndarray`` or raise."""
        try:
            # asarray also turns np.matrix into a base ndarray.
            arr = np.asarray(data)
        except Exception:
            raise TypeError("numpy.ndarray required for " + name)
        if arr.ndim != 2:
            raise ValueError(
                "%s must be 2-dimensional (samples x features), got %d "
                "dimension(s)" % (name, arr.ndim))
        return arr

    def _init_centroids(self, data_X):
        """Build the initial ``(k, n_features)`` centroid array."""
        if isinstance(self._initCent, str):
            if self._initCent != 'random':
                raise ValueError(
                    "unsupported initCent %r; use 'random' or an array of "
                    "shape (k, n_features)" % (self._initCent,))
            return self._randCent(data_X, self._k)
        # np.array copies, so the caller's array is never mutated by fit().
        centroids = np.array(self._initCent, dtype=float)
        expected = (self._k, data_X.shape[1])
        if centroids.shape != expected:
            raise ValueError(
                "initCent must have shape %r, got %r"
                % (expected, centroids.shape))
        return centroids

    def _assign(self, data_X):
        """Return (nearest centroid index, squared distance) per sample."""
        m = data_X.shape[0]
        sq_dists = np.empty((m, self._k))
        # Loop over the k centroids only; each step is vectorised over samples.
        for j in range(self._k):
            diff = data_X - self._centroids[j, :]
            sq_dists[:, j] = np.sum(diff * diff, axis=1)
        # argmin returns the first minimum, so ties go to the lowest index.
        nearest = np.argmin(sq_dists, axis=1)
        return nearest, sq_dists[np.arange(m), nearest]

    def fit(self, data_X):
        """Cluster ``data_X`` and store the result on the instance.

        Parameters
        ----------
        data_X : array-like of shape (m, n_features)

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``data_X`` cannot be converted to a numpy array.
        ValueError
            If ``data_X`` is not 2-D or is empty, if ``k`` is smaller than 1,
            or if ``initCent`` is not usable.
        """
        data_X = self._as_2d_array(data_X, "data_X")
        m = data_X.shape[0]  # number of samples
        if m == 0:
            raise ValueError("data_X must contain at least one sample")
        if self._k < 1:
            raise ValueError("k must be at least 1")

        self._centroids = self._init_centroids(data_X)

        # -1 marks "not assigned yet" so the first pass always counts as a
        # change, even when every sample lands in cluster 0.
        labels = np.full(m, -1)
        for _ in range(self._max_iter):
            new_labels, sq_dists = self._assign(data_X)
            if np.array_equal(new_labels, labels):
                # No sample switched cluster: converged.  sq_dists already
                # refers to the current centroids.
                break
            labels = new_labels
            for j in range(self._k):
                members = data_X[labels == j]
                # An empty cluster keeps its previous centroid rather than
                # becoming NaN (mean of an empty slice).
                if members.shape[0] > 0:
                    self._centroids[j, :] = members.mean(axis=0)
        else:
            # Iteration budget exhausted (or max_iter == 0): the centroids
            # were updated after the last assignment, so assign once more to
            # keep labels and SSE consistent with the final centroids.
            labels, sq_dists = self._assign(data_X)

        # Column 0: cluster index, column 1: squared distance to its centroid.
        self._clusterAssment = np.column_stack((labels, sq_dists))
        self._labels = self._clusterAssment[:, 0]
        self._sse = np.sum(self._clusterAssment[:, 1])
        return self

    def predict(self, X):
        """Return the index of the nearest fitted centroid for each row of X.

        Parameters
        ----------
        X : array-like of shape (m, n_features)

        Returns
        -------
        numpy.ndarray of shape (m,), float dtype

        Raises
        ------
        AttributeError
            If the model has not been fitted yet.
        TypeError
            If ``X`` cannot be converted to a numpy array.
        ValueError
            If ``X`` is not 2-D.
        """
        if getattr(self, '_centroids', None) is None:
            raise AttributeError("fit must be called before predict")
        X = self._as_2d_array(X, "X")
        nearest, _ = self._assign(X)
        # Float dtype is kept for backward compatibility with earlier output.
        return nearest.astype(float)