# file_creation.py
from os.path import isfile
from numpy import array as nparr
from numpy import quantile
from file_manip import file_generator, append_objs_to_file, read_file, write_text
from analysis_tools import get_file_paths, compute_relative_time, load_conv_followers
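
# Note: the functions below assume the working directory contains the data
# folders referenced in the path strings (sampled_conversations/, retweets/,
# quotes/, root_tweets/, follower_distr/unique_followers/, infl_ids/,
# infl_nflws/, infl_follower_ids/ and matched_infl/). The output folders
# presumably need to exist beforehand, since files are opened for writing
# without creating any directories.
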
def make_unique_flw_file(conv_ids: list[str], read_from_external_disk: bool = False) -> None:
    """Creates a file of the follower counts of the unique users in each
    conversation in the list. Will not write files for conversations for
    which such files already exist, or for which data is missing.

    Args:
    - conv_ids: list of conversation IDs
    - read_from_external_disk: If True, reads conversation data from
      the external disk file structure.
    """
    convs_proc, follower_file_existed, missing_data = 0, 0, 0
    for c_id in conv_ids:
        comp_follower_file = f'follower_distr/unique_followers/{c_id}_unique_follower_counts.txt'
        if isfile(comp_follower_file):
            follower_file_existed += 1
            continue
        if read_from_external_disk:
            conv_path = f'E:/tweets/conversations/{c_id}_conversation-tweets.jsonl'
            retw_path = f'E:/tweets/retweets/{c_id}.jsonl'
            quote_path = f'E:/tweets/quotes/{c_id}_quotes.jsonl'
        else:
            conv_path = f'sampled_conversations/{c_id}_conversation-tweets.jsonl'
            retw_path = f'retweets/{c_id}.jsonl'
            quote_path = f'quotes/{c_id}_quotes.jsonl'
        if not isfile(conv_path) or not isfile(retw_path) or not isfile(quote_path):
            missing_data += 1
            continue
        # The root tweet file is assumed to exist for every conversation.
        root = read_file(f'root_tweets/{c_id}_root.jsonl')[0]
        root_flw = int(root['author']['public_metrics']['followers_count'])
        root_auth_id = root['author_id']
        # Map author ID -> follower count; duplicate authors collapse
        # automatically since the dict is keyed on author ID.
        users = {root_auth_id: root_flw}
        for re in file_generator(conv_path):
            users[re['author_id']] = re['author']['public_metrics']['followers_count']
        for rt in file_generator(retw_path):
            users[rt['author_id']] = rt['author']['public_metrics']['followers_count']
        for q in file_generator(quote_path):
            users[q['author_id']] = q['author']['public_metrics']['followers_count']
        convs_proc += 1
        # The existence check above guarantees a fresh file, so plain
        # write mode is safe here.
        with open(comp_follower_file, 'w') as filehandle:
            for flw in users.values():
                filehandle.write('%d\n' % flw)
    print(f'Processed {convs_proc} conversations. {follower_file_existed} '
          f'already existed, and data files were missing for {missing_data}.')
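
# Example usage (a minimal sketch; the conversation ID below is a
# hypothetical placeholder). The resulting file holds one follower
# count per line, e.g. for three unique participants:
#
#   make_unique_flw_file(['1490000000000000000'])
#   # follower_distr/unique_followers/1490000000000000000_unique_follower_counts.txt:
#   # 120
#   # 4500
#   # 87
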
def make_infl_id_file(conv_ids: list[str], threshold_percentile: float) -> None:
    """Writes influencer IDs and follower counts to file. Does not create
    a file if no influencers are found, if such a file already exists, or
    if the data is sparse (<50 interactions) or missing.

    Args:
    - conv_ids: list of conversation IDs
    - threshold_percentile: percentile (as a fraction of 1) of the
      local/global follower distribution used to compute the threshold.
      Will be part of the file name.
    """
    processed, already_exists, missing_data, too_few = 0, 0, 0, 0
    # E.g. 0.95 -> '095'. Computed once, outside the loop, so that the
    # parameter is not overwritten on the first iteration.
    percentile_str = str(threshold_percentile).replace('.', '')
    for conv_id in conv_ids:
        infl_id_path = f'infl_ids/{conv_id}_{percentile_str}.txt'
        infl_nflw_path = f'infl_nflws/{conv_id}_nflws_{percentile_str}.txt'
        if isfile(infl_id_path):
            already_exists += 1
            continue
        infl_attr = dict()  # maps influencer ID to follower count
        paths = [*get_file_paths(conv_id)]  # TODO: option to read files from disk E:/
        follower_dist_path = f'follower_distr/unique_followers/{conv_id}_unique_follower_counts.txt'
        paths_exist = [isfile(p) for p in paths]
        if not all(paths_exist) or not isfile(follower_dist_path):
            missing_data += 1
            continue
        unique_follow_counts = nparr(load_conv_followers(follower_dist_path))
        # Quantile level: at least 0.95, and for large conversations high
        # enough that only about five users exceed the threshold.
        pval = max(0.95, 1 - 5/len(unique_follow_counts))
        threshold = quantile(unique_follow_counts, q=pval)
        counter = 0
        for path in paths[1:]:  # Skip the root tweet
            for tweet in file_generator(path):
                counter += 1
                if tweet['author']['public_metrics']['followers_count'] > threshold:
                    infl_attr[tweet['author']['id']] = tweet['author']['public_metrics']['followers_count']
        if counter < 50:
            too_few += 1
            continue
        i_ids, i_nflws = [], []
        for id_, nflw_ in infl_attr.items():
            i_ids.append(int(id_))
            i_nflws.append(nflw_)
        append_objs_to_file(infl_id_path, i_ids)
        append_objs_to_file(infl_nflw_path, i_nflws)
        processed += 1
    print(f'Created files for {processed} conversations. Files for {already_exists} conversations already existed.')
    print(f'In {missing_data} cases data was missing, and in {too_few} cases there were too few interactions.')
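
# Worked example of the threshold: with n unique follower counts the
# quantile level is max(0.95, 1 - 5/n), so for n = 50 the level is
# max(0.95, 0.90) = 0.95, while for n = 10000 it is
# max(0.95, 0.9995) = 0.9995, i.e. only about five users in a large
# conversation can sit above the threshold. A call could look like
# (the percentile argument only determines the '095' file-name suffix):
#
#   make_infl_id_file(['1490000000000000000'], threshold_percentile=0.95)
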
def make_infl_flwr_matching(conv_ids: list[str]) -> None:
    """Makes files containing the interactions of influencers and of
    influencer followers for a set of conversations, stored in the folder
    matched_infl. The output is a CSV-formatted file with influencer ID,
    user ID (a user who follows that influencer), interaction time, and
    interaction type as columns. Interactions by the influencers
    themselves are also added; in those rows the user ID is simply the
    influencer's own ID.

    Note that all reply, retweet and quote files of the conversation must
    exist, and there must be a list of follower IDs for each influencer
    in the conversation.

    Args:
    - conv_ids: a list of conversation IDs
    """
    already_matched, missing_data = 0, 0
    for conv_id in conv_ids:
        s_path = f'matched_infl/{conv_id}_user_infl_interactions.txt'
        if isfile(s_path):
            already_matched += 1
            continue
        # Read the influencer files (created with threshold_percentile=0.95,
        # hence the '095' suffix) and check that every influencer has a
        # follower-ID list on disk.
        infl_ids = read_file(f'infl_ids/{conv_id}_095.txt')
        existence = nparr([isfile(f'infl_follower_ids/{i_id}_followers.txt') for i_id in infl_ids])
        if not existence.all():
            missing_data += 1
            continue
        root = read_file(f'root_tweets/{conv_id}_root.jsonl')[0]
        root_time = root['created_at']
        interaction = {}
        # Read the interaction files for time stamps and the IDs of the
        # users interacting.
        file_paths = [
            f'sampled_conversations/{conv_id}_conversation-tweets.jsonl',
            f'retweets/{conv_id}.jsonl',
            f'quotes/{conv_id}_quotes.jsonl'
        ]
        for path in file_paths:
            # The first character of the path encodes the interaction type:
            # 's' (reply), 'r' (retweet) or 'q' (quote).
            interaction_type = path[0]
            for tw in file_generator(path):
                time, ID = tw['created_at'], tw['author_id']
                t = compute_relative_time(root_time, time)
                if ID not in interaction:
                    interaction[ID] = []
                interaction[ID].append((t, interaction_type))
        # Read the follower lists of the influencers and store matching
        # influencer/follower ID pairs, time differences and interaction types.
        for i_id in infl_ids:
            infl_follower_file, infl_id = f'infl_follower_ids/{i_id}_followers.txt', str(i_id)
            if infl_id in interaction:
                for ii in interaction[infl_id]:
                    dat = '{},{},{},{}'.format(infl_id, infl_id, ii[0], ii[1])
                    write_text(s_path, dat)
            for u_id in file_generator(infl_follower_file):
                user_id = str(u_id)
                if user_id in interaction:
                    for ui in interaction[user_id]:
                        dat = '{},{},{},{}'.format(infl_id, user_id, ui[0], ui[1])
                        write_text(s_path, dat)
    print(f'Out of {len(conv_ids)} conversations, {already_matched} were already processed, '
          f'and influencer follower IDs were missing for {missing_data}. Files were created '
          f'for {len(conv_ids)-already_matched-missing_data} conversations.')
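
# A minimal pipeline sketch with hypothetical conversation IDs, assuming
# the directory layout above and follower-ID lists in infl_follower_ids/
# fetched separately. The steps must run in this order, since each one
# reads files written by the previous step:
if __name__ == '__main__':
    conv_ids = ['1490000000000000000', '1490000000000000001']  # placeholders
    make_unique_flw_file(conv_ids)              # follower-count files
    make_infl_id_file(conv_ids, 0.95)           # influencer IDs ('095' suffix)
    make_infl_flwr_matching(conv_ids)           # influencer/follower matching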