-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtwarc2ToCSV.py
155 lines (109 loc) · 5.62 KB
/
twarc2ToCSV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import argparse
import json
import csv
import dateutil.parser
# This script processes the JSON results file produced by twarc2.
# It outputs the data as an easier-to-use CSV file, with the full tweet text for retweets
# and the full username of each user.
# Sample twarc2 command:
# twarc2 search --start-time 2021-12-01 --end-time 2021-12-07 --limit 1000 --max-results 100 --archive "snow Toronto" snow.jsonl
def processJson(jsonFile, outputFile, verbose):
    """Convert a twarc2 JSONL results file into a flat CSV file.

    Each non-blank line of *jsonFile* is parsed as one JSON response page.
    For every tweet found under the page's "data" key one CSV row is written
    with the columns: id, userid, username, createdDate, conversationId,
    tweetText, retweet, reply, like, quotes, isRetweet.

    The page's "includes" section is used to enrich each row:
      * the author's username is looked up in includes.users by author_id;
      * for retweets, the text of the original tweet is taken from
        includes.tweets instead of the text stored on the retweet itself.

    Fields missing from a tweet keep placeholder defaults (0, "",
    "1900-01-01 00:00:00", or -100 for the metric counts) so that gaps are
    easy to spot in the output.

    Args:
        jsonFile: path of the JSONL file to read (decoded as UTF-8).
        outputFile: path of the CSV file to create or overwrite.
        verbose: when true, print every tweet while it is processed.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Load every response page up front so that a malformed input file is
    # detected before the output file is created or truncated.
    jsonData = []
    with open(jsonFile, encoding='utf-8') as f:
        for line in f:
            # tolerate blank lines, e.g. an extra newline at the end of the file
            if line.strip():
                jsonData.append(json.loads(line))

    # number of tweets written so far; also the 1-based index shown in verbose mode
    count = 0

    # create list of csv header row values
    csvHeader = ['id', 'userid', 'username', 'createdDate', 'conversationId',
                 'tweetText', 'retweet', 'reply', 'like', 'quotes', "isRetweet"]

    with open(outputFile, 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
        # write the header row to the csv file
        writer.writerow(csvHeader)

        for jsonObj in jsonData:
            # Build per-page lookup tables from the includes section.
            # setdefault keeps the first entry when an id appears more than once.
            includes = jsonObj.get("includes", {})
            usernames = {}
            for user in includes.get("users", []):
                usernames.setdefault(user["id"], user["username"])
            fullTexts = {}
            for tweetInclude in includes.get("tweets", []):
                fullTexts.setdefault(tweetInclude["id"], tweetInclude["text"])

            # A page may carry no "data" at all, or a single tweet object
            # rather than a list of tweets.
            tweets = jsonObj.get("data", [])
            if isinstance(tweets, dict):
                tweets = [tweets]

            for tweet in tweets:
                count += 1

                tweetId = tweet.get("id", 0)
                tweetText = tweet.get("text", "")
                conversationId = tweet.get("conversation_id", 0)

                userid = 0
                username = ""
                if "author_id" in tweet:
                    userid = tweet["author_id"]
                    # fall back to an empty username when the author is not
                    # listed in the includes section
                    username = usernames.get(userid, "")

                newDate = "1900-01-01 00:00:00"
                if "created_at" in tweet:
                    oldDate = dateutil.parser.parse(tweet["created_at"])
                    newDate = oldDate.strftime("%Y-%m-%d %H:%M:%S")

                retweets = replys = likes = quotes = -100
                if "public_metrics" in tweet:
                    metrics = tweet["public_metrics"]
                    retweets = metrics.get("retweet_count", -100)
                    replys = metrics.get("reply_count", -100)
                    likes = metrics.get("like_count", -100)
                    quotes = metrics.get("quote_count", -100)

                # If the tweet references a retweeted tweet, use the original
                # tweet's text from the includes section. When the original is
                # not included, keep the text stored on the retweet.
                isRetweet = False
                for refTweet in tweet.get("referenced_tweets", []):
                    if refTweet["type"] == "retweeted":
                        isRetweet = True
                        tweetText = fullTexts.get(refTweet["id"], tweetText)

                # if verbose is set to True print individual tweet data while processing
                if verbose:
                    print("==================== Tweet {} ========================".format(count))
                    print("Tweet ID: {} User ID: {} Username: {} Date: {} Tweet text: {}\n".format(tweetId, userid, username, newDate, tweetText))

                # create a list of values for the csv writer
                writer.writerow([tweetId, userid, username, newDate, conversationId,
                                 tweetText, retweets, replys, likes, quotes, isRetweet])

    print("Processed {} tweets. Output written to: {}".format(count, outputFile))
def main():
    """Parse the command line and convert a twarc2 JSONL file to CSV.

    Options:
        -j / --json     path of the twarc2 JSONL file to read
        -o / --output   path of the CSV file to write
        -v / --verbose  print each tweet while it is processed

    Both -j and -o are needed. When either is missing, the usage text and an
    explanatory message are written to stderr and the process exits with
    status 2 (argparse's convention for command line errors), so calling
    scripts can detect the failure.
    """
    # process command line arguments
    parser = argparse.ArgumentParser(
        description="Convert twarc2 JSONL search results into a CSV file.")
    parser.add_argument("-j", "--json", metavar="FILE",
                        help="twarc2 JSONL results file to read")
    parser.add_argument("-o", "--output", metavar="FILE",
                        help="CSV file to write")
    parser.add_argument("-v", "--verbose", action='store_true',
                        help="print each tweet while it is processed")
    args = parser.parse_args()

    # Collect every problem first so the user sees all missing options at once.
    problems = []
    if not args.json:
        problems.append("Please include a json file using the -j <FILE> command line argument")
    if not args.output:
        problems.append("Please include a CSV output file using the -o <FILE> command line argument")
    if problems:
        # parser.error prints usage plus the message to stderr and exits with status 2
        parser.error("\n".join(problems))

    # both command line arguments are present: process JSON file from twarc
    processJson(args.json, args.output, args.verbose)
# Run the converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()