# dump_events_to_vega_format.py
import cPickle as pkl
import pandas as pd
import ujson as json
from datetime import datetime as dt
from check_k_best_trees import k_best_trees
def main():
    """Dump event and non-event message timestamps as a flat JSON list.

    Command-line arguments:
        --result_path: pickle file whose content is handed to
            ``k_best_trees``.
        --interactions_path: JSON file loaded with ``pd.read_json``; the
            resulting frame must have ``message_id`` and ``datetime``
            columns.
        --output_path: file the JSON records are written to.
        --non_event_sample_n: optional int; when given (and non-zero) the
            non-event rows are randomly down-sampled to at most this many.
        --k: int passed as the second argument to ``k_best_trees``
            (presumably the number of trees to keep -- confirm against
            ``check_k_best_trees``).

    Every record has the form ``{'series': ..., 'datetime': ...}``:
    nodes of the i-th tree (0-based) get ``series == 'event-<i+1>'``, and
    interaction rows whose ``message_id`` is not a node of any tree get
    ``series == 'non-event'``.  Timestamps are rendered with
    ``'%Y-%m-%dT%H:%M:%S.000Z'``.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--result_path')
    parser.add_argument('--interactions_path')
    parser.add_argument('--output_path')
    parser.add_argument('--non_event_sample_n', type=int)
    parser.add_argument('--k', type=int)
    args = parser.parse_args()

    # SECURITY: unpickling can execute arbitrary code -- only point
    # --result_path at files you produced yourself.
    # Binary mode + context manager: pickles are binary data, and the
    # handle is closed deterministically instead of being leaked.
    with open(args.result_path, 'rb') as result_file:
        result = pkl.load(result_file)
    trees = k_best_trees(result, args.k)

    df = pd.read_json(args.interactions_path)

    dt_format = '%Y-%m-%dT%H:%M:%S.000Z'

    data = []
    event_nodes = set()
    for tree_idx, tree in enumerate(trees):
        series_name = 'event-{}'.format(tree_idx + 1)
        for node in tree.nodes_iter():
            event_nodes.add(node)
            data.append(
                {
                    'series': series_name,
                    'datetime': tree.node[node]['datetime'].strftime(dt_format),
                }
            )

    # Dataset-specific lower bound on the interaction time.
    # for enron:
    df = df[df['datetime'] > dt(2000, 6, 1)]
    # for ukraine:
    # df = df[df['datetime'] > dt(2015, 2, 26)]
    # for baltimore
    # df = df[df['datetime'] >= dt(2015, 4, 27)]

    # Drop messages that belong to an event exactly once, up front, so the
    # sampled and unsampled paths share the same filter.
    non_events = df[~df['message_id'].isin(event_nodes)]

    if args.non_event_sample_n:
        # Single-argument print(...) yields the same output on Python 2 and 3.
        print(df.shape)  # size before event messages are removed
        # Cap at the number of available rows: DataFrame.sample raises
        # ValueError when asked for more rows than exist (replace=False).
        sample_n = min(args.non_event_sample_n, len(non_events))
        non_events = non_events.sample(n=sample_n)
        print(non_events.shape)  # size after sampling

    data.extend(
        {
            'series': 'non-event',
            'datetime': timestamp.strftime(dt_format),
        }
        for timestamp in non_events['datetime']
    )

    # Context manager guarantees the output is flushed and closed.
    with open(args.output_path, 'w') as output_file:
        json.dump(data, output_file)
# Script entry point: run the export only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()