-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbloomberg_util.py
63 lines (48 loc) · 1.73 KB
/
bloomberg_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from pymongo import MongoClient
import itertools
from util import json_dump, json_load
# Tags beginning with this prefix are treated as company tags.
TAG_PREFIX = 'topics/companies/'


def get_companies(article):
    """Return the company identifiers tagged on *article*.

    Every entry of ``article['tags']`` that starts with ``TAG_PREFIX``
    contributes the text after its last ``/``. Tag order is preserved and
    duplicates are not removed.
    """
    companies = []
    for tag in article['tags']:
        if not tag.startswith(TAG_PREFIX):
            continue
        # Keep only the final path segment of the tag.
        companies.append(tag.rsplit('/', 1)[-1])
    return companies
def has_multiple_companies(article):
    """Tell whether *article* carries more than one company tag.

    Counts the entries returned by ``get_companies``; repeated tags are
    counted each time they occur.
    """
    return len(get_companies(article)) > 1
def transform_article(a):
    """Rename the fields of article *a* into the interaction record layout.

    Mapping: ``url`` -> ``message_id``, ``title`` -> ``subject``,
    ``body`` (its items joined with single spaces) -> ``body``,
    ``publish_time`` -> ``timestamp``, and the company tags from
    ``get_companies`` -> ``participant_ids``.
    """
    record = {}
    record['message_id'] = a['url']
    record['subject'] = a['title']
    record['body'] = ' '.join(a['body'])
    record['timestamp'] = a['publish_time']
    record['participant_ids'] = get_companies(a)
    return record
def articles_articles(db, collection_name):
    """Return transformed records for every multi-company article.

    Iterates ``db[collection_name].find()``, drops the documents for which
    ``has_multiple_companies`` is false, and passes the remaining ones
    through ``transform_article``.
    """
    documents = db[collection_name].find()
    return [
        transform_article(doc)
        for doc in documents
        # articles tagged with a single company (or none) are skipped
        if has_multiple_companies(doc)
    ]
def dump2interactions(db, collection_name, output_path):
    """Collect the valid articles, report their count, and dump them.

    The records come from ``articles_articles(db, collection_name)``; they
    are handed to ``json_dump`` together with *output_path* (presumably
    written there as JSON -- see ``util.json_dump``) and also returned to
    the caller.
    """
    interactions = articles_articles(db, collection_name)
    print('# valid articles: ', len(interactions))
    json_dump(interactions, output_path)
    return interactions
def collect_people_info(articles):
    """Return one ``{'id': ...}`` record per distinct participant.

    *articles* is an iterable of records that each have a
    ``'participant_ids'`` sequence. The ids are de-duplicated across all
    articles, the number of distinct ids is printed, and the records are
    returned sorted by id.

    Sorting makes the result reproducible: iterating a set of strings can
    yield a different order on each interpreter run because of hash
    randomisation, which would make the dumped people file differ between
    otherwise identical runs. Ids are expected to be mutually comparable
    (they are strings when produced by this module).
    """
    participant_ids = set(
        # from_iterable avoids unpacking one argument per article
        itertools.chain.from_iterable(
            a['participant_ids'] for a in articles
        )
    )
    print('# unique participants: ', len(participant_ids))
    return [{'id': p} for p in sorted(participant_ids)]
if __name__ == '__main__':
    # Earlier step, kept for reference: builds interactions.json from the
    # 'articles' collection of the 'bloomberg' MongoDB database.
    # articles = dump2interactions(MongoClient()['bloomberg'],
    #                              'articles',
    #                              'data/bloomberg/interactions.json')

    # Current step: derive the participant list from the saved interactions.
    interactions = json_load('data/bloomberg/interactions.json')
    people = collect_people_info(interactions)
    json_dump(people, 'data/bloomberg/people.json')