-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload_dataflow_export.py
74 lines (56 loc) · 1.99 KB
/
load_dataflow_export.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import warnings
from data_tools import parse_dataflow_export, tokenize_tweet, load_crawled_terms
from bs4 import MarkupResemblesLocatorWarning
# Ignore BeautifulSoup's MarkupResemblesLocatorWarning from this point on.
# NOTE(review): presumably raised when tweet text that looks like a URL or a
# file name is handed to BeautifulSoup inside data_tools -- confirm.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
# Load the crawl keyword file once, at module level. The call returns three
# values; parse_tweet below matches tweets against crawled_hashtags and
# crawled_phrases (crawled_terms is not used by the code visible here).
crawled_terms, crawled_hashtags, crawled_phrases = load_crawled_terms(
"./keywords-3nov.txt", split_hashtags=True
)
def parse_str_to_int(s):
    """Convert a counter field to ``int``, falling back to ``0``.

    Args:
        s: The raw field value. A string made only of digits is parsed;
            an ``int`` is returned unchanged; anything else (``None``, an
            empty string, signed or fractional text, other types) gives 0.

    Returns:
        The parsed integer, or ``0`` when the value cannot be parsed.
    """
    # Already numeric (e.g. the record was parsed before): keep the value.
    if isinstance(s, int):
        return int(s)
    try:
        return int(s) if s.isdigit() else 0
    except (AttributeError, ValueError):
        # AttributeError: value has no ``isdigit`` (None, float, ...).
        # ValueError: ``isdigit`` accepts characters such as "²" that
        # ``int`` refuses to parse.
        return 0
def parse_tweet(tweet):
    """Enrich one exported tweet record in place and return it.

    The tweet text is run through ``tokenize_tweet`` and the resulting
    tokens, hashtags and entities are stored on the record. The retweet and
    quote counters are converted to integers, irrelevant columns are
    dropped, and a flag column with value ``1`` is added for every crawled
    hashtag and crawled phrase the tweet matches. The cleaned text returned
    by ``tokenize_tweet`` is currently not stored on the record.

    Args:
        tweet: Mutable mapping describing a single tweet. It must contain
            ``"text"`` and ``"retweet_count"``; ``"quote_count"`` and the
            dropped columns may be absent.

    Returns:
        The same ``tweet`` object that was passed in, after modification.

    Raises:
        KeyError: If ``"text"`` or ``"retweet_count"`` is missing.
    """
    _cleaned_text, tokens, hashtags, entities = tokenize_tweet(tweet["text"])
    tweet["tokens"] = tokens
    tweet["hashtags"] = hashtags
    tweet["entities"] = entities

    tweet["retweet_count"] = parse_str_to_int(tweet["retweet_count"])
    # Not every record carries a quote count; treat a missing one as zero.
    if "quote_count" in tweet:
        tweet["quote_count"] = parse_str_to_int(tweet["quote_count"])
    else:
        tweet["quote_count"] = 0

    # Drop irrelevant columns. pop() with a default is used so that a record
    # which already lacks one of them does not abort the whole parse.
    for column in ("sha256", "source", "coordinates", "processed", "media", "place"):
        tweet.pop(column, None)

    # Flag every crawled hashtag used by the tweet (compared in lowercase).
    for hashtag in hashtags:
        lowered_hashtag = hashtag.lower()
        if lowered_hashtag in crawled_hashtags:
            tweet["#" + lowered_hashtag] = 1

    # Flag every crawled phrase occurring as a substring of the lowercased
    # tweet text.
    # NOTE(review): the phrase itself is the column name, so a phrase equal
    # to an existing field name would overwrite that field -- confirm the
    # keyword list cannot contain such a phrase.
    lowered_tweet_text = tweet["text"].lower()
    for phrase in crawled_phrases:
        if phrase in lowered_tweet_text:
            tweet[phrase] = 1

    return tweet
# Tag of the export batch; it selects both the input and output directories.
EXPORT_TAG = "16-dec"

# One entry per exported data type. "parser" is optional: when it is absent,
# None is handed to parse_dataflow_export in its place.
data_sources = [
    {"type": "retweets"},
    {"type": "tweets", "parser": parse_tweet},
    {"type": "users"},
    {"type": "media"},
    # {"type": "hashtag"},
]

# Parse each export directory into a single JSON file per data type.
for source in data_sources:
    kind = source["type"]
    input_directory = "./data/bucket-export/{}/{}/".format(EXPORT_TAG, kind)
    output_file = "./data/parsed-export/{}/parsed_{}.json".format(
        EXPORT_TAG, kind
    )
    parse_dataflow_export(input_directory, output_file, source.get("parser"))