dump_vis_timeline_data.py
from datetime import datetime
from collections import Counter

from check_k_best_trees import k_best_trees
from meta_graph_stat import MetaGraphStat, build_default_summary_kws_from_path


def format_time(dt):
    # strftime cannot format years before 1900 under Python 2,
    # so fall back to the default string representation for such dates.
    if dt.year < 1900:
        return str(dt)
    else:
        return datetime.strftime(dt, '%Y-%m-%d %H:%M:%S')


def run(cand_trees, k, summary_kws, undirected):
    # Map message ids to the raw interaction records so tree nodes
    # can be resolved back to their subject/body/datetime fields.
    interactions = summary_kws['topics']['interactions']
    mid2i = {
        i['message_id']: i
        for i in interactions
    }

    trees = k_best_trees(cand_trees, k)
    summaries = [MetaGraphStat(t, summary_kws).summary_dict() for t in trees]

    items = []
    groups = []
    start_times = []
    end_times = []
    added_id_count = Counter()  # how many times each message id was added
    counter = 0  # running id for vis-timeline items

    for group_id, (summ, t) in enumerate(zip(summaries, trees), start=1):
        # One timeline item per message in the tree.
        for i in t.nodes_iter():
            counter += 1
            items.append({
                'id': counter,
                'content': (mid2i[i]['subject'].strip()
                            if mid2i[i]['subject'] else
                            mid2i[i]['body']),
                'start': format_time(mid2i[i]['datetime']),
                'group': group_id
            })
            added_id_count[i] += 1

        # One background item spanning the whole event.
        counter += 1
        items.append(
            {
                'id': counter,
                # 'id': 'event_{}'.format(group_id),
                'start': format_time(summ['time_span']['start_time']),
                'end': format_time(summ['time_span']['end_time']),
                'content': 'Event {}'.format(group_id),
                'group': group_id,
                'type': 'background'
            })

        g = {
            'id': group_id,
            'terms': summ['topics']['topic_terms'],
            # 'terms': summ['frequent_terms'],
            # 'terms': summ['tdidf_terms'],
            'participants': dict(
                summ['participants']['participant_count']
            ),
            'start': format_time(summ['time_span']['start_time']),
            'end': format_time(summ['time_span']['end_time']),
            'days': (summ['time_span']['end_time']
                     - summ['time_span']['start_time']).days,
            'link_type_freq': summ['link_type_freq']
        }
        if 'hashtags' in summ:
            g['hashtags'] = summ['hashtags']
        groups.append(g)

        start_times.append(summ['time_span']['start_time'])
        end_times.append(summ['time_span']['end_time'])

    return {
        'items': items,
        'groups': groups,
        'start': format_time(min(start_times)),
        'end': format_time(max(end_times))
    }
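
# A minimal sketch of the dict run() returns (values below are illustrative
# placeholders, not real data); this is what gets serialized for the
# vis-timeline front end:
#
#   {
#       'items': [
#           {'id': 1, 'content': 'some subject', 'start': '2001-05-10 09:00:00', 'group': 1},
#           {'id': 2, 'content': 'Event 1', 'start': '...', 'end': '...',
#            'group': 1, 'type': 'background'},
#           ...
#       ],
#       'groups': [
#           {'id': 1, 'terms': [...], 'participants': {...}, 'start': '...',
#            'end': '...', 'days': 3, 'link_type_freq': {...}},
#           ...
#       ],
#       'start': '...',  # earliest start_time across all events
#       'end': '...'     # latest end_time across all events
#   }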


def main():
    import argparse
    import cPickle as pkl
    from util import json_dump

    parser = argparse.ArgumentParser(description='dump vis timeline data')
    parser.add_argument('--cand_trees_path', required=True)
    parser.add_argument('--output_path', required=True)
    parser.add_argument('--interactions_path', required=True)
    parser.add_argument('--people_path', required=True)
    parser.add_argument('--corpus_dict_path', required=True)
    parser.add_argument('--lda_model_path', required=True)
    parser.add_argument('--people_repr_template', type=str,
                        default="{id}")
    parser.add_argument('-k', type=int, default=10)
    parser.add_argument('--undirected', default=False, action="store_true")
    args = parser.parse_args()

    summary_kws = build_default_summary_kws_from_path(
        args.interactions_path,
        args.people_path,
        args.corpus_dict_path,
        args.lda_model_path,
        args.people_repr_template,
        undirected=args.undirected
    )

    with open(args.cand_trees_path, 'rb') as f:
        trees = pkl.load(f)
    print(len(trees))

    # Include hashtags in the summaries if the trees carry them.
    first_node = trees[0].nodes()[0]
    if 'hashtags' in trees[0].node[first_node]:
        print('add hashtags')
        summary_kws['hashtags'] = {}

    data = run(trees,
               args.k,
               summary_kws,
               args.undirected)
    json_dump(data, args.output_path)


if __name__ == '__main__':
    main()
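
# Example invocation (a sketch only; the file names below are hypothetical
# placeholders for wherever the pickled candidate trees, interactions,
# people, corpus dictionary and LDA model files live in your setup):
#
#   python dump_vis_timeline_data.py \
#       --cand_trees_path cand_trees.pkl \
#       --interactions_path interactions.json \
#       --people_path people.json \
#       --corpus_dict_path corpus.dict \
#       --lda_model_path model.lda \
#       -k 10 \
#       --output_path timeline_data.json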