-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans.cpp
181 lines (156 loc) · 4.99 KB
/
kmeans.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#include "kmeans.h"

#include <cmath>
#include <stdexcept>
//Constructor: stores the problem dimensions and builds the membership table.
//  n_clusters:   number of clusters
//  data_size_in: number of data points
//The table has one row per cluster; every row starts with data_size_in slots
//holding -1, the placeholder for "no data point recorded at this position".
KMeans::KMeans(int n_clusters, int data_size_in)
{
    num_clusters = n_clusters;
    data_size = data_size_in;
    //Build the table per instance. A function-local static would be created only
    //on the first construction and then reused with stale dimensions by every
    //later instance.
    this->indices = vector<vector<int>>(num_clusters, vector<int>(data_size, -1));
}
//Pick the initial centroids as n_clusters distinct, randomly chosen points of the dataset.
//  data_in:      the dataset; a copy is stored in the object
//  data_size_in: number of leading points of data_in that may be chosen
//  n_clusters:   number of centroids to pick, must be in [1, data_size_in]
//  method:       currently unused
//Throws std::invalid_argument when the sizes are inconsistent.
//Any centroids from an earlier call are discarded.
void KMeans::def_initial_centroids(vector<float> data_in, int data_size_in, int n_clusters, char method)
{
    (void)method;

    if (data_size_in < 0 || static_cast<size_t>(data_size_in) > data_in.size())
    {
        throw std::invalid_argument("KMeans::def_initial_centroids: data_size_in does not fit data_in");
    }
    if (n_clusters < 1 || n_clusters > data_size_in)
    {
        throw std::invalid_argument("KMeans::def_initial_centroids: n_clusters must be in [1, data_size_in]");
    }

    data = data_in;
    data_size = data_size_in;
    num_clusters = n_clusters;

    //One slot per data point. The buffer must hold data_size entries, not
    //num_clusters, because every position 0..data_size-1 is written and shuffled.
    vector<int> index(data_size);
    for (int j = 0; j < data_size; j++)
    {
        index[j] = j;
    }

    //Shuffle the positions to get "random" centroids; shuffle() seeds the
    //random generator itself.
    this->shuffle(index.data(), index.size());

    //The first num_clusters shuffled positions select the starting centroids.
    centroids.clear();
    for (int i = 0; i < num_clusters; i++)
    {
        centroids.push_back(data[index[i]]);
    }
}
//Assign every point of data_in to its closest centroid and keep the centroids up to date.
//  data_in: the points to cluster; point p is recorded under index p
//Points are processed in order, and the centroid of the receiving cluster is
//recalculated right after each assignment, so later points are compared against
//the already-updated centroids.
//On return, indices[c] lists, in ascending order, the positions of the points
//assigned to cluster c. Membership from a previous call is discarded.
//Throws std::logic_error when no centroids have been defined.
void KMeans::clustering(vector<float> data_in)
{
    const size_t n_centroids = centroids.size();
    if (n_centroids == 0)
    {
        throw std::logic_error("KMeans::clustering: no centroids defined");
    }
    const size_t n_points = data_in.size();

    //Start from empty membership lists and append as points are assigned.
    //This gives the same lists as pre-filling with -1 and erasing the
    //placeholders, without depending on how the rows were sized earlier.
    if (indices.size() < n_centroids)
    {
        indices.resize(n_centroids);
    }
    for (auto &members : indices)
    {
        members.clear();
    }

    //R is the matrix handed to recalculate_centroid: row 0 holds the starting
    //centroids, row p+1 holds data point p in the column of its cluster
    //(all other entries stay 0).
    vector<vector<float>> R(n_points + 1, vector<float>(n_centroids, 0.0f));
    R[0] = centroids;

    vector<float> dists(n_centroids);
    for (size_t p = 0; p < n_points; ++p)
    {
        //Distances from this point to every current centroid
        for (size_t c = 0; c < n_centroids; ++c)
        {
            dists[c] = distance(data_in[p], centroids[c]);
        }

        //Closest centroid wins; on ties the lowest cluster index is taken
        const int min_ind = static_cast<int>(
            std::distance(dists.begin(), std::min_element(dists.begin(), dists.end())));

        R[p + 1][min_ind] = data_in[p];
        indices[min_ind].push_back(static_cast<int>(p));

        //recalculate_centroid takes the matrix by value, so R is copied on each call.
        recalculate_centroid(min_ind, static_cast<int>(p), R);
    }
}
//Write the stored cluster count and then every centroid value, separated by
//spaces, to standard output.
void KMeans::print_centroids()
{
    cout << "There are " << num_clusters << " centroids :" << endl;
    for (const auto &value : centroids)
    {
        cout << value << " ";
    }
    cout << endl;
}
//Helper used when choosing the initial centroids: permutes the first n entries
//of arr in place.
//  arr: array to permute
//  n:   number of entries in arr
//The random generator is reseeded from the current time on every call.
//Arrays with fewer than two entries are left untouched.
void KMeans::shuffle(int *arr, size_t n)
{
    if (n <= 1)
    {
        return;
    }

    srand(time(NULL));
    for (size_t pos = 0; pos + 1 < n; ++pos)
    {
        //Scale rand() down to an offset into the not-yet-fixed tail [pos, n)
        const size_t bucket = RAND_MAX / (n - pos) + 1;
        const size_t pick = pos + rand() / bucket;

        //Swap the picked entry into the current position
        const int held = arr[pick];
        arr[pick] = arr[pos];
        arr[pos] = held;
    }
}
//Distance between two one-dimensional points: the absolute value of their difference.
//  in1, in2: the two points
//The difference is taken before the absolute value, so points of opposite sign
//are the correct distance apart (-3 and 3 are 6 apart, not 0).
float KMeans::distance(float in1, float in2)
{
    return std::fabs(in1 - in2);
}
//Update one centroid to the mean of the values recorded for its cluster.
//  c_index: column of vIn, and position in centroids, of the cluster to update
//  d_index: index of the data point assigned last; rows 0..d_index+1 of vIn are scanned
//  vIn:     matrix whose non-zero entries in column c_index are the values of that cluster
//An entry equal to 0 means "nothing recorded here". A data value of exactly 0
//therefore cannot be told apart from an empty slot and is left out of the mean.
//When no value is found the centroid is left unchanged.
void KMeans::recalculate_centroid(int c_index, int d_index,vector<vector<float>> vIn)
{
    //Never scan past the rows vIn actually has
    const long long wanted = static_cast<long long>(d_index) + 2;
    size_t rows = wanted > 0 ? static_cast<size_t>(wanted) : 0;
    if (rows > vIn.size())
    {
        rows = vIn.size();
    }

    int count = 0;
    float sum = 0;
    for (size_t i = 0; i < rows; i++)
    {
        const float value = vIn[i][c_index];
        //Count positive and negative values alike; only 0 (and NaN) is skipped
        if (value > 0.0f || value < 0.0f)
        {
            count++;
            sum += value;
        }
    }

    //Avoid 0/0, which would store NaN as the centroid
    if (count == 0)
    {
        return;
    }
    centroids[c_index] = sum / count;
}
//Write the data values of every member of one cluster to standard output.
//  ind_centroid: index of the cluster whose members are printed
//The stored member indices are looked up in the object's copy of the dataset.
void KMeans::print_members(int ind_centroid)
{
    cout << "These are the members of cluster: " << ind_centroid << endl;

    const auto &member_ids = this->indices[ind_centroid];
    for (size_t k = 0; k < member_ids.size(); ++k)
    {
        cout << data[member_ids[k]] << " ";
    }
    cout << endl;
}
//Return the data values of the members of one cluster.
//  ind_centroid: index of the cluster
//Returns one value per stored member index, in stored order, read from the
//object's copy of the dataset.
vector<float> KMeans::get_members(int ind_centroid)
{
    const auto &member_ids = this->indices[ind_centroid];

    vector<float> values;
    for (const int id : member_ids)
    {
        values.push_back(data[id]);
    }
    return values;
}
//Return the indices of the members of one cluster.
//  ind_centroid: index of the cluster
//The returned indices refer to positions in the original dataset.
vector<int> KMeans::get_index_members(int ind_centroid)
{
    const auto &member_ids = this->indices[ind_centroid];
    return vector<int>(member_ids.begin(), member_ids.end());
}