-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathk_medoids_better.py
118 lines (95 loc) · 4.2 KB
/
k_medoids_better.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# https://pyclustering.github.io/docs/0.10.1/html/d0/dd3/classpyclustering_1_1cluster_1_1kmedoids_1_1kmedoids.html
import time
import os
from nltk import edit_distance
from pyclustering.cluster.kmedoids import kmedoids
import numpy as np
import pandas as pd
import argparse
# Seed NumPy's global RNG so the randomly drawn first medoid is reproducible.
np.random.seed(1234)

# Command-line interface: the only option is the number of medoids.
ap = argparse.ArgumentParser()
ap.add_argument(
    "-k",
    "--n_medoids",
    type=int,
    default=500,
    help="number of medoids (prototypes)",
)
# NOTE(review): this parses sys.argv at import time and stores the options as
# a plain dict; the script entry point parses the arguments again.
args = vars(ap.parse_args())
def distance_measuring_(seq1, seq2):
    """Return the distance between two sequences.

    Thin wrapper around nltk's ``edit_distance`` called with its default
    options, kept as a single place to swap the metric.
    """
    return edit_distance(seq1, seq2)
def get_key(x):
    """Sort key: return the element at index 1 of ``x``.

    Used to order ``(index, distance)`` pairs by their distance.
    """
    second_item = x[1]
    return second_item
def _get_init_centers(n_clusters, samples):
"""return random points as initial centers"""
# randomly choose one initial sequence as the first center
init_ids = [np.random.randint(0, len(samples))]
init_sequence = samples[init_ids[0]] # sequence
all_distances = []
# the idea is to choose medoids far away from each other
for _ in range(1, len(samples)):
current_seq = samples[_]
all_distances.append((_, distance_measuring_(init_sequence, current_seq)))
arranged = sorted(all_distances, key=get_key,
reverse=True) # descending order
arranged_ids = [_[0] for _ in arranged]
init_ids += arranged_ids[:n_clusters - 1]
return init_ids
def get_distance_matrix(samples):
    """Build the dense pairwise distance matrix of ``samples``.

    Each unordered pair is measured once with ``distance_measuring_`` and the
    value is written to both ``[i, j]`` and ``[j, i]``; the diagonal stays at
    zero. This relies on the metric being symmetric, which holds for the edit
    distance with nltk's default costs, and halves the number of expensive
    distance computations.

    Args:
        samples: indexable, sized collection of sequences.

    Returns:
        A ``(len(samples), len(samples))`` float NumPy array.

    Side effects:
        Prints the elapsed wall-clock time in minutes.
    """
    n_samples = len(samples)
    dist_mat = np.zeros((n_samples, n_samples))  # diagonal is already 0.
    start = time.time()
    for i in range(n_samples):
        seq_i = samples[i]
        # Only the upper triangle is computed; the lower one is mirrored.
        for j in range(i + 1, n_samples):
            pair_distance = distance_measuring_(seq_i, samples[j])
            dist_mat[i, j] = pair_distance
            dist_mat[j, i] = pair_distance
    end = time.time()
    delay = (end - start) / 60
    print('Time of execution: {} minutes'.format(delay))
    return dist_mat
def _report_clusters(sequences, medoid_ids, clusters):
    """Print the medoid sequences and each cluster's non-medoid members.

    ``medoid_ids`` and the members of ``clusters`` are positions in
    ``sequences`` (the list the distance matrix was built from).
    """
    medoid_id_set = set(medoid_ids)
    sequences_medoids = [sequences[idx] for idx in medoid_ids]
    print(sequences_medoids)
    # filtering out medoids who appear to be members of their own clusters
    clusters_sequences = {
        med: [sequences[idx] for idx in set(members).difference(medoid_id_set)]
        for med, members in zip(sequences_medoids, clusters)
    }
    print(clusters_sequences)


def main(args):
    """Cluster the positively labelled sequences of ``D1.csv`` with k-medoids.

    Steps:
      1. Read ``D1.csv`` and keep the ``Sequences`` of rows whose ``Label``
         equals 1.
      2. Build the pairwise distance matrix, or reuse the cached
         ``distance_matrix_D1.npy``.
      3. Run k-medoids on that matrix, or reuse the cached
         ``medoids_k_<k>.npy`` / ``clusters_k_<k>.npy`` results.
      4. Print the medoid sequences and the members of every cluster.

    Args:
        args: parsed command-line options; only ``args.n_medoids`` is read.
    """
    dataset = pd.read_csv('D1.csv')
    amps = dataset[dataset['Label'] == 1]['Sequences']
    sequences = amps.tolist()
    k = args.n_medoids

    distance_matrix = None
    if not os.path.exists('distance_matrix_D1.npy'):
        print('Building the matrix distance...')
        distance_matrix = get_distance_matrix(sequences)
        np.save('distance_matrix_D1', distance_matrix)
    else:
        print('Loading existing distance matrix...')

    medoids_path = 'medoids_k_{}.npy'.format(k)
    clusters_path = 'clusters_k_{}.npy'.format(k)
    # Both cache files are needed to skip clustering; recompute if either is missing.
    if not (os.path.exists(medoids_path) and os.path.exists(clusters_path)):
        if distance_matrix is None:
            distance_matrix = np.load('distance_matrix_D1.npy')
        # Initial medoids are only needed when clustering actually runs.
        initial_medoids = _get_init_centers(k, sequences)
        matrix = distance_matrix.tolist()
        start = time.time()
        km = kmedoids(matrix, initial_medoids, data_type='distance_matrix')
        km.process()
        centers = km.get_medoids()
        clusters = km.get_clusters()
        end = time.time()
        delay = (end - start) / 60
        print('Time of K-medoid (PAM) execution: {} minutes'.format(delay))
        print(centers)
        print(clusters)
        # saves final medoids ids --- the sequences can be retrieved later using those ids
        np.save('medoids_k_{}'.format(k), centers)
        # Clusters usually differ in size; an explicit object array is required
        # because recent NumPy refuses to build ragged arrays implicitly.
        np.save('clusters_k_{}'.format(k), np.array(clusters, dtype=object))  # final clusters members ids
        # Ids are positions in `sequences` (the filtered list), not row labels
        # of the unfiltered dataset, so look them up in `sequences`.
        _report_clusters(sequences, centers, clusters)
    else:
        print('Loading existing k-medoids ...')
        # NOTE(review): allow_pickle=True unpickles the file content; only
        # load cache files that were written by this script.
        medoids = np.load(medoids_path, allow_pickle=True)
        clusters = np.load(clusters_path, allow_pickle=True)
        _report_clusters(sequences, medoids.tolist(), clusters.tolist())
# Script entry point: parse the command line into a namespace and run the pipeline.
if __name__ == "__main__":
    args = ap.parse_args()
    main(args)