-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
62 lines (54 loc) · 2.42 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pandas as pd
import hashlib
import numpy as np
from operator import itemgetter
import sys
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

# The CSV to analyse is given as the first command-line argument.
if len(sys.argv) < 2:
    sys.exit('usage: {} <csv file>'.format(sys.argv[0]))
df = pd.read_csv(sys.argv[1])
# Strip whitespace around the column names so they can be addressed by name.
df.columns = [c.strip() for c in df.columns]
# We are not using traceID for now, we are using debugID.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument (`drop('traceID', 1)`).
df = df.drop(columns='traceID')
# create md5 code to remove duplicate packets
def f(x):
    """Return an MD5 hex digest built from the values at positions 1-4 of *x*.

    *x* is one row: a pandas Series (as passed by ``DataFrame.apply`` with
    ``axis=1``) or any plain sequence supporting integer indexing. The four
    values are converted with ``str`` and concatenated before hashing, so
    rows that agree on those four positions get the same digest.

    NOTE: the values are joined without a delimiter, so e.g. ('1', '23') and
    ('12', '3') in adjacent positions produce the same digest.
    """
    # Integer keys on a label-indexed Series are deprecated as positions in
    # pandas, so go through .iloc when the row offers it.
    by_position = x.iloc if hasattr(x, 'iloc') else x
    com = ''.join(str(by_position[i]) for i in range(1, 5)).encode('utf-8')
    return hashlib.md5(com).hexdigest()
# Tag every row with its digest so that duplicate packets share one code.
df['code'] = df.apply(f, axis=1)

# TODO: find out why there are two packets.
# Difference in ts between consecutive rows sharing a code; the first row of
# each code has no predecessor and is counted as a difference of 0.
duplicate_gaps = df.groupby('code')['ts'].diff().fillna(0).values
print('95 percentile of difference between duplicates:', np.percentile(duplicate_gaps, 95))
print('99 percentile of difference between duplicates:', np.percentile(duplicate_gaps, 99))

# We consider the mean ts of the duplicates of a packet.
mean_of_times = df.groupby('code')['ts'].mean().reset_index()
df = df.drop_duplicates(subset=['code'], keep='first')
# Drop the original ts first, otherwise the merge below would yield two ts
# columns. `columns=` is used because pandas 2.0 no longer accepts the axis
# as a positional argument.
df = df.drop(columns='ts')
# Merge in the mean ts, then get rid of code; it is not needed any more.
df = df.merge(mean_of_times, on='code')
df = df.drop(columns='code')
# Some of the packets don't know their request type; we recover it from
# another packet that has the same debugID.
known = df[df['req'].notnull()]
# Later rows overwrite earlier ones, so the last known req of a debugID wins.
debugID2req = dict(zip(known['debugID'], known['req']))
# Only missing values are filled; a packet whose debugID has no known req
# stays null. Vectorised, so it also works on an empty frame.
df['req'] = df['req'].fillna(df['debugID'].map(debugID2req))
# Final analysis: for every request type, print the 95th percentile of the ts
# difference between consecutive packets of a request.
data = {req: packets for req, packets in df.groupby('req')}
detailed_printed = set()
for key, packets in data.items():
    byDebugID = list(packets.groupby('debugID'))
    if not byDebugID:
        continue
    # First or last requests may not be captured completely, so the request
    # in the middle is taken as the reference for the number of packets.
    expected_packets = byDebugID[len(byDebugID) // 2][1].shape[0]
    gaps = []
    for _, request in byDebugID:
        # Requests with a different packet count are left out. They must not
        # contribute a row at all: a placeholder row of zeros would be counted
        # in the percentile below.
        if request.shape[0] != expected_packets:
            continue
        request = request.sort_values(by="ts")
        if key not in detailed_printed:
            # Print the src/dst columns of one request per request type.
            detailed_printed.add(key)
            print(request[['src', 'dst']])
        gaps.append(np.diff(request['ts']))
    diffs = np.percentile(np.array(gaps), 95, axis=0)
    print(key, len(byDebugID), np.round(diffs, 3))