-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
382a046
commit 2cb7286
Showing
8 changed files
with
188 additions
and
180 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,5 @@ | ||
# KMeans_cpp | ||
An algorithm to implement K Means clustering method in C++ | ||
|
||
An algorithm to implement K Means clustering method in C++. | ||
|
||
Currently it only takes a 1D dataset of floats. It will be updated to take 3D datasets soon. |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
#include "kmeans.h" | ||
|
||
|
||
|
||
// Constructs a KMeans instance for `n_clusters` clusters over a dataset of
// `data_size_in` points.
//
// The membership table `indices` gets one row per cluster and one slot per
// data point. Every slot starts at -1, the "not assigned" sentinel.
KMeans::KMeans(int n_clusters, int data_size_in)
{
    num_clusters = n_clusters;
    data_size = data_size_in;

    // Build the table for this instance. The previous version copied it from a
    // function-local `static`, which is initialised only the first time the
    // constructor runs, so every later instance silently inherited the first
    // instance's dimensions.
    indices.assign(num_clusters, vector<int>(data_size, -1));
}
|
||
// Picks the initial centroids by drawing `n_clusters` distinct points at
// random from the dataset.
//
//   data_in       the dataset; a copy is stored in the `data` member
//   data_size_in  number of leading points of `data_in` to draw from
//   n_clusters    number of centroids to pick
//   method        currently unused
//
// The chosen values are appended to `centroids`. If the sizes are inconsistent
// (no clusters requested, fewer points than clusters, or `data_size_in` larger
// than `data_in`), an error is reported on stderr and the object is left
// unchanged.
void KMeans::def_initial_centroids(vector<float> data_in, int data_size_in, int n_clusters, char method)
{
    (void)method;

    if (n_clusters <= 0 || data_size_in < n_clusters
        || static_cast<size_t>(data_size_in) > data_in.size())
    {
        cerr << "KMeans::def_initial_centroids: invalid sizes (data_size_in="
             << data_size_in << ", n_clusters=" << n_clusters
             << ", data_in.size()=" << data_in.size() << ")" << endl;
        return;
    }

    srand(time(0));
    data = data_in;
    data_size = data_size_in;
    num_clusters = n_clusters;

    // One candidate index per data point. The previous version declared
    // `int index[num_clusters]` and then wrote `data_size` entries into it,
    // overflowing the array whenever there were more points than clusters.
    vector<int> index(data_size);
    for (int j = 0; j < data_size; j++)
    {
        index[j] = j;
    }

    // Shuffle the indices so the first `num_clusters` of them are a random
    // sample without repetition.
    shuffle(index.data(), index.size());

    for (int i = 0; i < num_clusters; i++)
    {
        centroids.push_back(data_in[index[i]]);
    }
}
|
||
//The clustering function will assign everypoint of the dataset to a cluster and recalculate the centroid as the mean of everypoint on it | ||
|
||
void KMeans::clustering(vector<float> data_in) | ||
{ | ||
vector<vector<float>> R (data_size, vector<float>(num_clusters)); | ||
vector<float> ctemp; | ||
vector<float> sub_vector; | ||
vector<float>::iterator it_d; | ||
vector<float>::iterator it_c; | ||
ctemp = centroids; | ||
vector<vector<float>> dists(data_size, vector<float> (num_clusters, 0)); | ||
int min_ind; | ||
float min_dis; | ||
|
||
|
||
R.insert(R.begin(), ctemp); | ||
|
||
//Calculate all distances between the centroids and the data points | ||
int n =0, i=0; | ||
for (it_d = data_in.begin();it_d<data_in.end();it_d++) | ||
{ | ||
i=0; | ||
for(it_c = centroids.begin();it_c<centroids.end();it_c++) | ||
{ | ||
dists[n][i] = distance(*it_d,*it_c); | ||
i++; | ||
} | ||
//Find the closest centroid and assign the data point to it | ||
sub_vector.reserve(num_clusters); | ||
sub_vector = dists[n]; | ||
vector<float>::iterator result = min_element(sub_vector.begin(), sub_vector.end()); | ||
min_ind = std::distance(sub_vector.begin(), result); | ||
min_dis = *result; | ||
R[n+1][min_ind]=data_in[n]; | ||
indices[min_ind][n] = n; | ||
recalculate_centroid(min_ind,n,R); | ||
|
||
n++; | ||
} | ||
//clear the '-1' from the vector | ||
vector<int>::iterator ind; | ||
for(int i = 0;i<num_clusters;i++) | ||
{ | ||
do{ | ||
ind = find(indices[i].begin(), indices[i].end(), -1); | ||
if(ind != indices[i].end() ) indices[i].erase(ind); | ||
}while(ind != indices[i].end() ); | ||
} | ||
|
||
} | ||
|
||
//This function will print the centroids | ||
void KMeans::print_centroids() | ||
{ | ||
vector<float>::iterator it; | ||
cout<<"There are " << num_clusters << " centroids :"<<endl; | ||
for (it = centroids.begin(); it != centroids.end(); ++it) | ||
cout<<*it<<" "; | ||
|
||
cout<<endl; | ||
} | ||
|
||
// Helper for choosing the initial centroids: shuffles the `n` integers in
// `arr` in place. The generator is re-seeded from the current time on each
// call that has at least two elements to shuffle.
void KMeans::shuffle(int *arr, size_t n)
{
    // Zero or one element: nothing to reorder.
    if (n <= 1)
    {
        return;
    }

    srand(time(NULL));

    const size_t last = n - 1;
    for (size_t pos = 0; pos < last; ++pos)
    {
        // Pick a slot in [pos, n) and swap it into position `pos`.
        const size_t remaining = n - pos;
        const size_t pick = pos + rand() / (RAND_MAX / remaining + 1);

        const int held = arr[pick];
        arr[pick] = arr[pos];
        arr[pos] = held;
    }
}
|
||
// Returns the distance between two 1D points, i.e. |in1 - in2|.
//
// The previous formula, abs(abs(in1) - abs(in2)), compared magnitudes only, so
// points on opposite sides of zero (e.g. -3 and 3) were reported as 0 apart.
// The absolute value is computed by hand so the result never depends on which
// `abs` overload happens to be visible.
float KMeans::distance(float in1, float in2)
{
    const float diff = in1 - in2;
    return diff < 0 ? -diff : diff;
}
|
||
//function to update the centroid based | ||
void KMeans::recalculate_centroid(int c_index, int d_index,vector<vector<float>> vIn) | ||
{ | ||
int count=0; | ||
float res=0; | ||
for(int i=0;i<(d_index+2);i++) | ||
{ | ||
if(vIn[i][c_index]>0) | ||
{ | ||
count ++; | ||
res += vIn[i][c_index]; | ||
} | ||
|
||
} | ||
res = res/count; | ||
centroids[c_index] =res; | ||
} | ||
|
||
//function to print the members of a cluster | ||
void KMeans::print_members(int ind_centroid) | ||
{ | ||
// cout<<indices[0][0]<<endl; | ||
// cout<<indices[0][1]<<endl; | ||
vector<int>::iterator it; | ||
cout<<"These are the members of cluster: " << ind_centroid <<endl; | ||
for (it = this->indices[ind_centroid].begin(); it != this->indices[ind_centroid].end(); ++it) cout<<data[*it]<<" "; | ||
cout<<endl; | ||
|
||
} | ||
|
||
// Returns the data values belonging to cluster `ind_centroid`, in the order
// their indices are stored for that cluster.
vector<float> KMeans::get_members(int ind_centroid)
{
    const vector<int> &member_ids = this->indices[ind_centroid];

    vector<float> values;
    for (size_t k = 0; k < member_ids.size(); ++k)
    {
        // Each stored entry is a position in `data`.
        values.push_back(data[member_ids[k]]);
    }

    return values;
}
|
||
// Returns a copy of the indices of the members of cluster `ind_centroid`.
// The indices refer to positions in the original dataset.
vector<int> KMeans::get_index_members(int ind_centroid)
{
    const vector<int> &member_ids = this->indices[ind_centroid];
    return vector<int>(member_ids.begin(), member_ids.end());
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.