""" Almost everything related to KitNET is done in this module.
KitNET is an online anomaly detection algorithm based on an ensemble of autoencoders, invented by [Mirsky et al, 2017].
Most of the code below is a modified version of their code, released under the MIT license.
Source: https://github.com/ymirsky/KitNET-py
Paper: Yisroel Mirsky, Tomer Doitshman, Yuval Elovici, and Asaf Shabtai, "Kitsune: An Ensemble of Autoencoders for
Online Network Intrusion Detection", Network and Distributed System Security Symposium 2018 (NDSS'18)
https://arxiv.org/abs/1802.09089
A small part of the code is a modified version of the code by [Yusuke, 2017], released under the MIT license.
Source: https://github.com/yusugomori/DeepLearning .
The corresponding license texts are at end of this file.
"""
import numpy as np
import time
from scipy.cluster.hierarchy import linkage, to_tree
import pickle
from helper_funcs import append_logs, get_full_path, synthetic_data7
use_synthetic_data7 = synthetic_data7()
name4logs = "lib_KitNET_calc"
msg = "Reading Sample dataset..."
append_logs(msg, name4logs, "verbose")
if use_synthetic_data7:
filename = "dataset/syntheticData.txt"
else:
filename = "dataset/fetchedData.txt"
np.seterr(all='ignore')
def sigmoid(x):
return 1. / (1 + np.exp(-x))
class DenoisingAutoencoderParams:
"""A data class for storing the Denoising Autoencoder params."""
def __init__(self, n_visible=5, n_hidden=3, lr=0.001, corruption_level=0.0, grace_period=10000, hidden_ratio=None):
self.n_visible = n_visible # num of units in visible (input) layer
self.n_hidden = n_hidden # num of units in hidden layer
self.lr = lr
self.corruption_level = corruption_level
self.gracePeriod = grace_period
self.hiddenRatio = hidden_ratio
class DenoisingAutoencoder:
"""This class represents Denoising Autoencoder.
Autoencoder is a type of artificial neural network used to learn efficient data codings in an unsupervised manner.
For a popular introduction into Autoencoders, see: https://en.wikipedia.org/wiki/Autoencoder
For Denoising Autoencoders, see:
Vincent et al, 2008. Extracting and Composing Robust Features with Denoising Autoencoders
    Yusuke Sugomori, 2013. Stochastic Gradient Descent for Denoising Autoencoders, http://yusugomori.com/docs/SGD_DA.pdf
"""
def __init__(self, params):
self.params = params
if self.params.hiddenRatio is not None:
self.params.n_hidden = int(np.ceil(self.params.n_visible * self.params.hiddenRatio))
        # for 0-1 normalization
        self.norm_max = np.ones((self.params.n_visible,)) * -np.inf
        self.norm_min = np.ones((self.params.n_visible,)) * np.inf
self.n = 0
self.rng = np.random.RandomState(1234)
a = 1. / self.params.n_visible
self.W = np.array(self.rng.uniform( # initialize W uniformly
low=-a,
high=a,
size=(self.params.n_visible, self.params.n_hidden)))
self.hbias = np.zeros(self.params.n_hidden) # initialize h bias 0
self.vbias = np.zeros(self.params.n_visible) # initialize v bias 0
self.W_prime = self.W.T
def get_corrupted_input(self, g_input, corruption_level):
assert corruption_level < 1
return self.rng.binomial(size=g_input.shape,
n=1,
p=1 - corruption_level) * g_input
# Encode
def get_hidden_values(self, e_input):
return sigmoid(np.dot(e_input, self.W) + self.hbias)
# Decode
def get_reconstructed_input(self, hidden):
return sigmoid(np.dot(hidden, self.W_prime) + self.vbias)
def train(self, x):
self.n = self.n + 1
# update norms
self.norm_max[x > self.norm_max] = x[x > self.norm_max]
self.norm_min[x < self.norm_min] = x[x < self.norm_min]
# 0-1 normalize
        x = (x - self.norm_min) / (self.norm_max - self.norm_min + 1e-16)  # epsilon avoids division by zero
if self.params.corruption_level > 0.0:
tilde_x = self.get_corrupted_input(x, self.params.corruption_level)
else:
tilde_x = x
y = self.get_hidden_values(tilde_x)
z = self.get_reconstructed_input(y)
l_h2 = x - z
l_h1 = np.dot(l_h2, self.W) * y * (1 - y)
        l_vbias = l_h2
        l_hbias = l_h1
        l_w = np.outer(tilde_x.T, l_h1) + np.outer(l_h2.T, y)
        self.W += self.params.lr * l_w
        # x is a single 1-D instance, so the per-unit bias gradients are applied directly;
        # np.mean(..., axis=0) on a 1-D array would collapse them to a single scalar
        self.hbias += self.params.lr * l_hbias
        self.vbias += self.params.lr * l_vbias
return np.sqrt(np.mean(l_h2 ** 2)) # the RMSE reconstruction error during training
def reconstruct(self, x):
y = self.get_hidden_values(x)
z = self.get_reconstructed_input(y)
return z
    def execute(self, x):  # returns the RMSE of the reconstruction of x
if self.n < self.params.gracePeriod:
return 0.0
else:
# 0-1 normalize
            x = (x - self.norm_min) / (self.norm_max - self.norm_min + 1e-16)
z = self.reconstruct(x)
            rmse = np.sqrt(((x - z) ** 2).mean())  # RMSE
return rmse
def inGrace(self):
return self.n < self.params.gracePeriod
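# A minimal usage sketch (added for illustration; not part of the original KitNET code).
# It uses only the classes defined above: train a small autoencoder online on random
# 5-dimensional vectors, then score a fresh vector. grace_period=0 makes execute()
# return an RMSE immediately instead of 0.0.
def _demo_denoising_autoencoder():
    params = DenoisingAutoencoderParams(n_visible=5, n_hidden=3, lr=0.1,
                                        corruption_level=0.0, grace_period=0)
    dae = DenoisingAutoencoder(params)
    for _ in range(1000):
        dae.train(np.random.rand(5))  # one online SGD step; returns the training RMSE
    return dae.execute(np.random.rand(5))  # RMSE reconstruction error of a new point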
class CorClust:
""" A helper class for KitNET which performs a correlation-based incremental clustering of the dimensions in X
n: the number of dimensions in the dataset
For more information and citation, please see the NDSS'18 paper:
Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection
"""
def __init__(self, n):
# parameter:
self.n = n
        # variables
        self.c = np.zeros(n)  # linear sum of features
        self.c_r = np.zeros(n)  # linear sum of feature residuals
        self.c_rs = np.zeros(n)  # linear sum of squared feature residuals
self.C = np.zeros((n, n)) # partial correlation matrix
self.N = 0 # number of updates performed
# x: a numpy vector of length n
def update(self, x):
self.N += 1
self.c += x
c_rt = x - self.c / self.N
self.c_r += c_rt
self.c_rs += c_rt ** 2
self.C += np.outer(c_rt, c_rt)
# creates the current correlation distance matrix between the features
def corrDist(self):
        c_rs_sqrt = np.sqrt(self.c_rs)
        big_c_rs_sqrt = np.outer(c_rs_sqrt, c_rs_sqrt)
        big_c_rs_sqrt[big_c_rs_sqrt == 0] = 1e-100  # protects against divide-by-zero errors (occurs when a feature is constant)
        dist = 1 - self.C / big_c_rs_sqrt  # the correlation distance matrix
        # small negatives may appear due to the incremental fashion in which we update the mean, so we clip them to 0
        dist[dist < 0] = 0
        return dist
# clusters the features together, having no more than maxClust features per cluster
def cluster(self, max_clust):
corr_dist = self.corrDist()
linkage_matrix = linkage(corr_dist[np.triu_indices(self.n, 1)]) # a linkage matrix based on the distance matrix
if max_clust < 1:
max_clust = 1
if max_clust > self.n:
max_clust = self.n
cluster_map = self.__breakClust__(to_tree(linkage_matrix), max_clust)
return cluster_map
# a recursive helper function which breaks down the dendrogram branches until
# all clusters have no more than maxClust elements
def __breakClust__(self, dendro, max_clust):
        if dendro.count <= max_clust:  # base case: we found a minimal cluster, so mark it
            return [dendro.pre_order()]  # return the original ids of the features in this cluster
return self.__breakClust__(dendro.get_left(), max_clust) + self.__breakClust__(dendro.get_right(), max_clust)
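# A minimal usage sketch (added for illustration; not part of the original KitNET code):
# feed random 4-dimensional vectors to CorClust, then ask for clusters of at most
# 2 features each. The result is a list of feature-index lists, e.g. [[0, 2], [1, 3]].
def _demo_corclust():
    fm = CorClust(4)
    for _ in range(500):
        fm.update(np.random.rand(4))  # incrementally accumulate correlation statistics
    return fm.cluster(max_clust=2)  # the dendrogram is broken down into small clusters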
class KitNET:
"""This class represents a KitNET machine learner.
# n: the number of features in your input dataset (i.e., x \in R^n)
# m: the maximum size of any autoencoder in the ensemble layer
# AD_grace_period: the number of instances the network will learn from before producing anomaly scores
# FM_grace_period: the number of instances which will be taken to learn the feature mapping. If 'None',
# then FM_grace_period=AM_grace_period.
# learning_rate: the default stochastic gradient descent learning rate for all autoencoders in the KitNET instance.
# hidden_ratio: the default ratio of hidden to visible neurons. E.g., 0.75 will cause roughly a 25% compression in
# the hidden layer.
# feature_map: One may optionally provide a feature map instead of learning one. The map must be a list, where the
# i-th entry contains a list of the feature indices to be assingned to the i-th autoencoder in the ensemble.
# For example, [[2,5,3],[4,0,1],[6,7]]
For more information and citation, please see the NDSS'18 paper:
Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection
"""
def __init__(self, n, max_autoencoder_size=10, fm_grace_period=None, ad_grace_period=10000, learning_rate=0.1,
hidden_ratio=0.75, feature_map=None):
# Parameters:
self.AD_grace_period = ad_grace_period
if fm_grace_period is None:
self.FM_grace_period = ad_grace_period
else:
self.FM_grace_period = fm_grace_period
if max_autoencoder_size <= 0:
self.m = 1
else:
self.m = max_autoencoder_size
self.lr = learning_rate
self.hr = hidden_ratio
self.n = n
# Variables
self.n_trained = 0 # the number of training instances so far
self.n_executed = 0 # the number of executed instances so far
self.v = feature_map
if self.v is None:
append_logs("Feature-Mapper: train-mode, Anomaly-Detector: off-mode", "KitNET lib", "verbose")
else:
self.__createAD__()
append_logs("Feature-Mapper: execute-mode, Anomaly-Detector: train-mode", "KitNET lib", "verbose")
        self.FM = CorClust(self.n)  # incremental feature clustering for the feature mapping process
self.ensembleLayer = []
self.outputLayer = None
    # If FM_grace_period+AD_grace_period has passed, then this function executes KitNET on x.
# Otherwise, this function learns from x.
# x: a numpy array of length n
# Note: KitNET automatically performs 0-1 normalization on all attributes.
def process(self, x):
if self.n_trained > self.FM_grace_period + self.AD_grace_period: # If both the FM and AD are in execute-mode
return self.execute(x)
else:
self.train(x)
return 0.0
# force train KitNET on x
# returns the anomaly score of x during training (do not use for alerting)
def train(self, x):
# If the FM is in train-mode, and the user has not supplied a feature mapping
if self.n_trained <= self.FM_grace_period and self.v is None:
            # update the incremental correlation matrix
self.FM.update(x)
if self.n_trained == self.FM_grace_period: # If the feature mapping should be instantiated
self.v = self.FM.cluster(self.m)
self.__createAD__()
t_msg = "The Feature-Mapper found a mapping: " + str(self.n) + " features to " + str(
len(self.v)) + " autoencoders."
append_logs(t_msg, "KitNET lib", "verbose")
t_msg = "Feature-Mapper: execute-mode, Anomaly-Detector: train-mode"
append_logs(t_msg, "KitNET lib", "verbose")
else: # train
# Ensemble Layer
s_l1 = np.zeros(len(self.ensembleLayer))
for a in range(len(self.ensembleLayer)):
# make sub instance for autoencoder 'a'
xi = x[self.v[a]]
s_l1[a] = self.ensembleLayer[a].train(xi)
# OutputLayer
self.outputLayer.train(s_l1)
if self.n_trained == self.AD_grace_period + self.FM_grace_period:
t_msg = "Feature-Mapper: execute-mode, Anomaly-Detector: exeute-mode"
append_logs(t_msg, "KitNET lib", "verbose")
self.n_trained += 1
return self
# force execute KitNET on x
def execute(self, x):
        if self.v is None:
            e_msg = "Can't execute x, because a feature mapping has not been learned or provided. Try running process(x) instead."
            append_logs(e_msg, "KitNET lib", "always")
            raise RuntimeError(e_msg)
else:
self.n_executed += 1
# Ensemble Layer
s_l1 = np.zeros(len(self.ensembleLayer))
for a in range(len(self.ensembleLayer)):
# make sub inst
xi = x[self.v[a]]
s_l1[a] = self.ensembleLayer[a].execute(xi)
# OutputLayer
return self.outputLayer.execute(s_l1)
def __createAD__(self):
# construct ensemble layer
for ad_map in self.v:
params = DenoisingAutoencoderParams(n_visible=len(ad_map), n_hidden=0, lr=self.lr, corruption_level=0,
grace_period=0, hidden_ratio=self.hr)
self.ensembleLayer.append(DenoisingAutoencoder(params))
# construct output layer
params = DenoisingAutoencoderParams(len(self.v), n_hidden=0, lr=self.lr, corruption_level=0,
grace_period=0, hidden_ratio=self.hr)
self.outputLayer = DenoisingAutoencoder(params)
def pickle(self):
timestamp = time.time()
obj_to_save = (self, timestamp)
with open(get_full_path('pickled_models/test_file.pkl'), 'wb') as pkl:
pickle.dump(obj_to_save, pkl)
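# A minimal usage sketch of the online loop (added for illustration; not part of the
# original KitNET code). 'vectors' is assumed to be a list of equal-length numpy arrays.
# process() learns the feature map for the first fm_grace_period instances, trains the
# ensemble for the next ad_grace_period instances (returning 0.0 throughout), and only
# then starts returning real anomaly scores.
def _demo_kitnet(vectors):
    detector = KitNET(n=len(vectors[0]), max_autoencoder_size=10,
                      fm_grace_period=100, ad_grace_period=900)
    return [detector.process(x) for x in vectors]  # 0.0 during the grace periods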
def get_model(input_dataframe):
input_arr = input_dataframe.to_numpy()
dataset_size = len(input_dataframe.index)
# KitNET params:
max_ae = 10 # maximum size for any autoencoder in the ensemble layer
    fm_grace = int(dataset_size * 0.1)  # the number of instances taken to learn the feature mapping (the ensemble's architecture)
ad_grace = dataset_size - fm_grace # the number of instances used to train the anomaly detector (ensemble itself)
append_logs("Dataset_size: " + str(dataset_size) + " . FMgrace: " + str(fm_grace) + " . ADgrace: " + str(ad_grace),
name4logs, "verbose")
append_logs("numpy.ndarray tail my input_arr:\n" + str(input_arr[-3:]), name4logs, "verbose")
# Build KitNET
kit_net_obj = KitNET(input_arr.shape[1], max_ae, fm_grace, ad_grace)
model = None
for i in range(input_arr.shape[0]):
if i % 1000 == 0:
g_msg = "progress: " + str(i)
# save_model_to_pickle(model, -1, "pickled_models/kitnet_test_" + str(i) + ".pkl")
append_logs(g_msg, name4logs, "verbose")
        model = kit_net_obj.train(input_arr[i])
return model, None, True
def ask_model(lmodel, observations_df, scaling): # TODO: use scaling for KitNET too
datapoint = None
try:
datapoint = observations_df.to_numpy()[-1]
rmse_score = lmodel.execute(datapoint)
except Exception as e:
rmse_score = 0
append_logs("ERROR: KitNET ask_model failed. datapoint: " + str(datapoint) + " . Exception: " + str(e),
name4logs, "always")
return rmse_score
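# A minimal end-to-end sketch of the two entry points above (added for illustration;
# not part of the original code). 'df' is assumed to be a pandas DataFrame whose rows
# are observations; its column count fixes the number of features n. Note that
# ask_model() scores only the last row of the DataFrame it is given.
def _demo_get_and_ask(df):
    model, _, _ = get_model(df)  # trains KitNET on every row of df
    return ask_model(model, df, scaling=None)  # RMSE anomaly score of df's last row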
# ------------------------------------
# Most of the code above is a modified version of the code by Yisroel Mirsky (2017), released under the MIT license.
# A small part of the code is a modified version of the code by Yusuke Sugomori (2017), released under the MIT license.
# The corresponding license texts are below.
# Copyright (c) 2017 Yusuke Sugomori
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Portions of this code have been adapted from Yusuke Sugomori's code on GitHub:
# https://github.com/yusugomori/DeepLearning
# ----------------------------------------------------------------------
# Copyright (c) 2017 Yisroel Mirsky
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Portions of this code have been adapted from Yisroel Mirsky's code on GitHub:
# https://github.com/ymirsky/KitNET-py
# For more information and citation, see the NDSS'18 paper:
# Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection