-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvisualization.py
145 lines (120 loc) · 5.57 KB
/
visualization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
Visualization for the paper
"""
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from utils import tokenize, make_requirements
from cluster import cluster_requirements
from gensim.models import Doc2Vec
from sklearn.cluster import Birch
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import os
# Echo the module docstring so a run of the script identifies itself.
print(__doc__)

# No synthetic sample data is generated here: X is built further down from
# the Doc2Vec vectors inferred for the feature file, and no label vector is
# needed because the clustering is unsupervised.
# Only the antivirus data set is visualised. The file-sharing and browser
# data sets follow the same naming scheme, e.g.
# clean/clean-browser-features.txt with model/model_browser.
NUM_CLUSTERS = 5

# Input files are looked up relative to the current working directory.
_working_dir = os.getcwd()
feature_path = os.path.join(_working_dir, 'clean', 'clean-antivirus-features.txt')
model_path = os.path.join(_working_dir, 'model', 'model_antivirus')

# tokenize() comes from the project's utils module; presumably it yields one
# token sequence per feature description -- TODO confirm against utils.py.
features = tokenize(feature_path)

# Load the trained Doc2Vec model from disk.
model = Doc2Vec.load(model_path)

# Infer one document vector per tokenized feature, in file order.
X = [model.infer_vector(tokens) for tokens in features]
# Cluster counts to visualise; one figure is created per entry.
range_n_clusters = [NUM_CLUSTERS]

for n_clusters in range_n_clusters:
    # A single axes: only the scatter plot of the clusters is drawn.
    fig, ax2 = plt.subplots(1, 1)
    fig.set_size_inches(8, 5)

    # Cluster the inferred vectors. Birch takes no random_state parameter,
    # so there is no seed to fix here.
    clusterer = Birch(branching_factor=50, n_clusters=n_clusters,
                      threshold=0.05, compute_labels=True)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value over all samples, which
    # indicates how dense and well separated the formed clusters are.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Project the vectors to 2-D for display only. The result is bound to
    # its own name so that X still holds the original vectors when the next
    # n_clusters value is clustered and scored.
    # NOTE: t-SNE is stochastic; the layout differs between runs unless a
    # random_state is passed.
    embedding = TSNE(n_components=2).fit_transform(X)

    # Scatter plot of the samples, coloured by cluster label.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(embedding[:, 0], embedding[:, 1], marker='.', s=30, lw=0,
                alpha=0.7, c=colors, edgecolor='k')

    # Cluster centres are not drawn: Birch computes them in the space of X,
    # not in the 2-D t-SNE plane shown here.
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    # Applies to the current figure, i.e. the one created above.
    plt.tight_layout()

# Display every figure created by the loop.
plt.show()