-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathknn_clustering.py
109 lines (93 loc) · 3.31 KB
/
knn_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 22 22:43:35 2020
@author: sg
"""
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd
def random_centers(dim, k, low=0, high=100):
    """Generate ``k`` random integer-valued starting centers.

    Args:
        dim: Number of coordinates in each center.
        k: Number of centers to generate.
        low: Smallest value a coordinate may take (inclusive).
        high: Largest value a coordinate may take (inclusive).

    Returns:
        A list of ``k`` centers, each a list of ``dim`` integers drawn with
        ``random.randint(low, high)``.
    """
    # Coordinates are drawn center by center, so a seeded ``random`` module
    # yields a reproducible set of centers.
    return [
        [random.randint(low, high) for _ in range(dim)]
        for _ in range(k)
    ]
def point_clustering(data, centers, dims, first_cluster=False):
    """Label every point in ``data`` with the index of its nearest center.

    Only the first ``dims`` coordinates of a point take part in the distance
    computation. The label is stored as the last element of the point.

    Args:
        data: List of points; each point is a mutable list of numbers.
        centers: List of centers, each with at least ``dims`` coordinates.
        dims: Number of leading coordinates used to measure distance.
        first_cluster: When true the label is appended to each point;
            otherwise the current last element of the point is overwritten.

    Returns:
        The same ``data`` list, whose points were modified in place.
    """
    for point in data:
        # With no centers at all, every point keeps the default label 0.
        nearest_center = 0
        nearest_center_dist = None
        for i, center in enumerate(centers):
            # Squared Euclidean distance. The square root is skipped because
            # it is monotonic and therefore does not change which center is
            # the closest one.
            squared_dist = sum(
                (point[d] - center[d]) ** 2 for d in range(dims)
            )
            # Strict comparison: on a tie the lowest center index wins.
            if nearest_center_dist is None or squared_dist < nearest_center_dist:
                nearest_center_dist = squared_dist
                nearest_center = i
        if first_cluster:
            point.append(nearest_center)
        else:
            point[-1] = nearest_center
    return data
def mean_center(data, centers, dims):
    """Recompute each center as the mean of the points assigned to it.

    A point belongs to center ``i`` when its last element equals ``i``.

    Args:
        data: List of labelled points; the last element of each point is
            its cluster index.
        centers: Current list of centers.
        dims: Number of leading coordinates of a point that are averaged.

    Returns:
        A new list with one center per entry of ``centers``. A center with
        no assigned points is carried over unchanged.
    """
    new_centers = []
    for i, old_center in enumerate(centers):
        members = [point for point in data if point[-1] == i]
        if members:
            count = len(members)
            new_centers.append(
                [sum(point[d] for point in members) / count for d in range(dims)]
            )
        else:
            # Empty cluster: there is nothing to average, keep the old center.
            new_centers.append(old_center)
    return new_centers
# Gets data and k, returns a list of center points.
def train_k_means_clustering(data, k=2, epochs=5):
    """Fit ``k`` cluster centers to ``data``.

    Initial centers come from ``random_centers``. Each epoch recomputes the
    centers with ``mean_center`` and then reassigns the points with
    ``point_clustering``.

    Args:
        data: Non-empty sequence of points; every point is a sequence of
            numbers and the length of the first point sets the dimension.
        k: Number of centers to fit.
        epochs: Number of recompute/reassign rounds to run.

    Returns:
        The list of ``k`` centers after the last epoch.
    """
    dims = len(data[0])
    # Cluster labels are stored on the points themselves, so work on copies.
    # This keeps the caller's data free of label columns, which would
    # otherwise be counted as an extra feature on a second training run.
    points = [list(point) for point in data]
    centers = random_centers(dims, k)
    points = point_clustering(points, centers, dims, first_cluster=True)
    for _ in range(epochs):
        centers = mean_center(points, centers, dims)
        points = point_clustering(points, centers, dims, first_cluster=False)
    return centers
def predict_k_means_clustering(point, centers):
    """Return the index of the center closest to ``point``.

    Args:
        point: Sequence of coordinates to classify.
        centers: Non-empty list of centers.

    Returns:
        The index into ``centers`` of the nearest center, measured by
        Euclidean distance over all coordinates of ``point``.

    Raises:
        ValueError: If ``point`` and the first center differ in length.
    """
    dims = len(point)
    center_dims = len(centers[0])
    if dims != center_dims:
        raise ValueError(
            'Point given for prediction has {} dimensions but centers '
            'have {} dimensions'.format(dims, center_dims)
        )
    nearest_center = None
    nearest_dist = None
    for i, center in enumerate(centers):
        # Every coordinate counts, starting from index 0. Squared distances
        # are compared directly because the square root would not change
        # which center is nearest.
        squared_dist = sum((point[d] - center[d]) ** 2 for d in range(dims))
        # Strict comparison: on a tie the lowest center index wins.
        if nearest_dist is None or squared_dist < nearest_dist:
            nearest_dist = squared_dist
            nearest_center = i
    return nearest_center