-
Notifications
You must be signed in to change notification settings - Fork 0
/
ClozeAnalysis.py
373 lines (308 loc) · 14.6 KB
/
ClozeAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
#!/usr/bin/env python
# coding: utf-8
import io
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.metrics.distance import edit_distance as levenshtein_distance
# we will use this function to calculate the number of non-empty responses for each target word
def length_nonempty(df_list):
    """Count the entries of ``df_list`` that are not the blank response ``[""]``.

    Used to get the number of non-empty responses collected for a target
    word: every entry equal to ``[""]`` is treated as empty, anything else
    counts as a response.
    """
    return sum(1 for response in df_list if response != [""])
# Participants excluded because their responses were not appropriate
# (identified in a first pass of the analysis).
# NOTE(review): the values are treated as positions in the full combined
# participant list (group 1 followed by group 2) -- confirm.
exclude = [27, 40]

# Participants' data are first collected in two lists, one per group;
# it is faster to merge them into one big list afterwards.
df_1List = []
df_2List = []

# Group 1: the results file is read line by line, one JSON object per line.
# Alternative (local) location of the same file:
# with open('C:/Users/User/OwnCloud/ClozeTask_sharedfolder/CLOZEresults/Group1/jatos_results_20210826185507.txt') as f:
with open('//cbsu/data/Imaging/hauk/users/fm02/EOS_data/CLOZEresults/Group1/jatos_results_20210826185507.txt') as f:
    for jsonObj in f:
        # Wrap the line in StringIO: passing a literal JSON string to
        # pd.read_json is deprecated in recent pandas.
        df_1 = pd.read_json(io.StringIO(jsonObj))
        df_1List.append(df_1)

# ... and group 2 (same layout).
# with open('C:/Users/User/OwnCloud/ClozeTask_sharedfolder/CLOZEresults/Group2/jatos_results_20210826140139.txt') as f:
with open('//cbsu/data/Imaging/hauk/users/fm02/EOS_data/CLOZEresults/Group2/jatos_results_20210826140139.txt') as f:
    for jsonObj in f:
        df_2 = pd.read_json(io.StringIO(jsonObj))
        df_2List.append(df_2)

# One big list containing all participants.
# Each participant is a pd.DataFrame containing all the events in the
# experiment (after the instructions, rows alternate between cloze and
# likert (plausibility) trials).
dfList = [*df_1List, *df_2List]

# Remove the excluded participants from the highest position down.
# Popping in ascending order shifts every later element one place to the
# left, so the second pop would remove the wrong participant.
for ex in sorted(set(exclude), reverse=True):
    dfList.pop(ex)
# Accumulators that are filled while looping over the participants.
clozeList = []       # each participant's responses to the cloze task
plausiList = []      # each participant's plausibility ratings
predList = []        # number of words predicted by each participant (performance check)
wordspredicted = []  # the words correctly predicted by each participant

# Collect every possible word ID from the first and the last participant:
# words in group 1 and group 2 carry different IDs.
word_ids = [*dfList[0]['ID'].dropna().astype(int),
            *dfList[-1]['ID'].dropna().astype(int)]

# Series indexed by word ID whose values start as empty lists; each list is
# later populated with all the responses given for that word.
# Attention checks ('ID' = 999) are dropped.
all_resp = pd.Series([[] for _ in word_ids], index=word_ids).drop(999)

# Same structure for the plausibility ratings.
all_plausib = pd.Series([[] for _ in word_ids], index=word_ids).drop(999)
# Pattern isolating the first word of a response: split on '.', ' ' or ','.
# It is a raw string because '\.' inside a normal string literal is an
# invalid escape sequence.
first_word_splitter = re.compile(r'\.| |,')

# loop over each participant
for df in dfList:
    # cloze trials, restricted to the relevant columns
    cloze = df[df['trial_type'] == 'cloze'][['ID', 'target', 'response']]
    # kept per participant; used to calculate each word's cloze probability
    clozeList.append(cloze)

    # store every response under its word ID, ignoring the attention checks
    # (ID 999) - all participants responded well enough (already checked)
    for word_id, response in zip(cloze['ID'], cloze['response']):
        if word_id != 999:
            all_resp[word_id].append(response)

    # survey-likert (aka plausibility) trials
    plausibility = df[df['trial_type'] == 'survey-likert'][['response']]
    plausibility = plausibility.rename(columns={'response': 'plausibility'})
    # the ratings follow the same order as the cloze trials, so they can be
    # given the same IDs
    plausibility['ID'] = cloze['ID'].tolist()
    plausiList.append(plausibility)

    # save all plausibility ratings in one place (used to calculate the average)
    for word_id, rating in zip(plausibility['ID'], plausibility['plausibility']):
        if word_id != 999:
            all_plausib[word_id].append(rating.get('Q0'))

    # Count how many (and which) words this participant predicted.
    # Only the FIRST word of the response is considered:
    #  - capitalised words are accepted (comparison is lower-cased)
    #  - words followed by '.' or ',' are accepted
    #  - misspellings are accepted, defined as a Levenshtein distance <= 2
    #    from the target (some words such as 'aisle' were frequently misspelled)
    wordpredicted = []
    for word_id, target, response in zip(cloze['ID'], cloze['target'], cloze['response']):
        first_word = first_word_splitter.split(response[0])[0].lower()
        # exact matches short-circuit, so the distance is only computed
        # when the strings differ
        if target == first_word or levenshtein_distance(target, first_word) <= 2:
            wordpredicted.append(int(word_id))
    predicted = len(wordpredicted)

    # number of words predicted (to check each participant's performance)
    predList.append(predicted)
    # ... and which words were predicted
    wordspredicted.append(wordpredicted)
# Dataframe storing, for each target word, its cloze probability and responses.
# It is built from the ID/target pairs of the last participant loaded in each group.
get_cloze = pd.concat([df_1[['ID', 'target']].dropna(), df_2[['ID', 'target']].dropna()])
get_cloze['cloze'] = np.zeros(len(get_cloze))
get_cloze['ID'] = get_cloze['ID'].astype(int)
get_cloze = get_cloze.set_index('ID')

# Count, for each word, how many participants predicted it (attention checks
# excluded). A single .loc[row, column] write is used on purpose: the chained
# form get_cloze['cloze'].loc[word] += 1 may update a temporary copy and
# silently leave get_cloze unchanged (always so under pandas copy-on-write).
for sub in wordspredicted:
    for word in sub:
        if word != 999:
            get_cloze.loc[word, 'cloze'] += 1
get_cloze['cloze'] = get_cloze['cloze'].astype(int)

# attach the list of all responses given for each word
get_cloze = get_cloze.merge(all_resp.rename('all_resps'),
                            left_on='ID', right_on=all_resp.index)

# cloze probability = correct predictions / number of non-empty responses
get_cloze_denominator = get_cloze['all_resps'].apply(length_nonempty)
get_cloze['cloze'] = get_cloze['cloze'] / get_cloze_denominator

# Add 1 to each rating: stored values start from 0, but the likert scale that
# was displayed started from 1 (to 7).
all_plausib = pd.Series([np.array(x) + 1 for x in all_plausib], index=all_plausib.index)
all_plausib = all_plausib.apply(np.mean)  # mean rating for each target word

# keyed on all_plausib's own index (same values as all_resp.index)
get_cloze = get_cloze.merge(all_plausib.rename('plausibility'),
                            left_on='ID', right_on=all_plausib.index)

# distribution of cloze probabilities
plt.subplots()
sns.histplot(data=get_cloze, x='cloze', bins=5)

# distribution of mean plausibility ratings
plt.subplots()
sns.histplot(data=get_cloze, x='plausibility', bins=[1, 2, 3, 4, 5, 6, 7])
##############################################################################
##############################################################################
### DIFFERENT PART OF THE SCRIPT: LET'S CALCULATE CLOZE_SEMANTIC_SIMILARITY
##############################################################################
##############################################################################
import gensim.downloader as api
from gensim.parsing import preprocessing
from nltk.corpus import stopwords
from spellchecker import SpellChecker
# word2vec model trained on Google News
wv = api.load('word2vec-google-news-300')

# spell checker instance
spell = SpellChecker()

# Table with two columns, one with the UK spelling and one with the US
# spelling. It is needed because word2vec has words in US spelling, not UK.
df = pd.read_csv('C:/Users/fm02/OwnCloud/ClozeTask_sharedfolder/UK2US.csv', sep=';')

# lookup tables for both directions of the conversion
UK2US_dict = df.set_index('UK')['US'].to_dict()
US2UK_dict = df.set_index('US')['UK'].to_dict()
def replace_all(text, Udict):
    """Return a copy of ``text`` with every element found in ``Udict`` replaced.

    Parameters
    ----------
    text : list or pandas.Series
        Flat sequence of words. It is not modified; a copy is returned.
    Udict : dict
        Mapping from the word to replace to its replacement
        (e.g. the UK2US or the US2UK dictionary).

    Returns
    -------
    Same type as ``text``, with the same length (and index, for a Series).
    """
    text2 = text.copy()
    is_series = isinstance(text2, pd.Series)
    # iterate over a snapshot of the input so that writes never affect iteration
    for i, w in enumerate(list(text)):
        if w in Udict:
            if is_series:
                # enumerate() yields positions, so write by position: plain
                # text2[i] is label-based on a Series and would hit the wrong
                # row (or add a new one) when the index is not 0..n-1
                text2.iloc[i] = Udict[w]
            else:
                text2[i] = Udict[w]
    return text2
def clean_resp(resps):
    """Prepare the raw cloze responses so that they can be fed to word2vec.

    Each response is lower-cased, stripped of punctuation and digits,
    tokenised, converted from UK to US spelling and cleared of stop words.
    Words of unknown spelling are replaced by the spell checker's
    suggestion, and any word that word2vec has no vector for is dropped.

    Parameters
    ----------
    resps : iterable
        One entry per trial (i.e. word ID); each entry is an iterable of
        responses, and each response is a list whose first element is the
        text typed by the participant.

    Returns
    -------
    list of list of list of str
        For each trial, one list of usable words per response. Responses
        left without any usable word are omitted, so every inner list is
        non-empty and contains only words known to word2vec.
    """
    # Stop words come from nltk: the gensim list is too large and includes
    # some of the target words.
    stop_words = set(stopwords.words('english'))
    all_trials = []
    # loop over all trials (i.e. word IDs)
    for trial_idx, trial in enumerate(resps):
        # progress indicator
        print(trial_idx)
        new_trial = []
        # loop over the responses given by the different participants
        for s in trial:
            # each response is contained inside a list: take its text
            s = str(s[0]).lower()
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_numeric(s)
            s = nltk.word_tokenize(s)
            s = replace_all(s, UK2US_dict)
            s = [word for word in s if word not in stop_words]
            # similarity only works with words of known spelling, so try to
            # correct the ones the spell checker does not recognise
            misspelled = spell.unknown(s)
            cleaned = []
            for w in s:
                if w in misspelled:
                    # may yield no suggestion at all, hence the truth test below
                    w = spell.correction(w)
                # Keep only words word2vec knows, otherwise it throws an error.
                # Dropping the word (rather than leaving an empty placeholder)
                # guarantees callers never get a response without usable words.
                if w and wv.has_index_for(w):
                    cleaned.append(w)
            if cleaned:
                new_trial.append(cleaned)
        all_trials.append(new_trial)
    return all_trials
responses = get_cloze['all_resps']
# word2vec uses US spelling, so convert the targets before comparing
targets = replace_all(get_cloze['target'], UK2US_dict)
# prepare the responses to be analysed in w2v (consider that w2v is US, not UK)
tok = clean_resp(responses)

# NOTE(review): 400 rows are hard-coded - presumably the number of target
# words; confirm it matches len(targets).
word_similarities = pd.DataFrame(index=np.arange(400), columns=['Word', 'Sim'])
word_similarities['Word'] = targets

# for each trial (i.e. word ID)
for i, t in enumerate(tok):
    # tok follows the row order of get_cloze, so fetch the target by position
    target = targets.iloc[i]
    similarities = []
    # for each response in that trial
    for list_words in t:
        # only words that word2vec has a vector for can be compared
        known_words = [w for w in list_words if wv.has_index_for(w)]
        # skip responses with no usable word (max() of nothing would raise)
        if known_words:
            # keep the similarity of the word that is most similar to the target
            similarities.append(max(wv.similarity(w, target) for w in known_words))
    # The average similarity between responses and target word is our measure
    # of CLOZE SEMANTIC SIMILARITY (NaN when no response could be scored).
    # Single .loc[rows, column] write: the chained form
    # word_similarities['Sim'].loc[...] = ... may assign to a temporary copy.
    word_similarities.loc[word_similarities['Word'] == target, 'Sim'] = (
        np.mean(similarities) if similarities else np.nan)

# the distribution is less skewed than the normal cloze distribution
plt.subplots()
sns.histplot(data=word_similarities, x='Sim', bins=10)

# get back to UK spelling
word_similarities['Word'] = replace_all(word_similarities['Word'], US2UK_dict)
# ## Now let's look at how cloze fits with the other word characteristics

# columns to read from the stimuli spreadsheet
stimuli_columns = [
    'ID',
    'Word',
    'Sentence',
    'ConcM',
    'LEN',
    'UN2_F',
    'UN3_F',
    'Orth',
    'OLD20',
    'FreqCount',
    'LogFreq(Zipf)',
    'V_MeanSum',
    'A_MeanSum',
    'mink3_SM',
    'Position',
    'BLP_rt',
    'BLP_accuracy',
    'similarity',
    'PRECEDING_Frequency',
    "PRECEDING_LogFreq(Zipf)",
    "LENprec",
    'AoA',
    'Predictability',
]

# first, load the stimuli table
stimuli = pd.read_excel('C:/Users/fm02/OwnCloud/Sentences/stimuli_ALL.xlsx',
                        engine='openpyxl',
                        usecols=stimuli_columns)

# add cloze probability and plausibility, matching each stimulus 'Word'
# against the cloze target words
stimuli = pd.merge(stimuli, get_cloze[['cloze', 'plausibility']],
                   left_on='Word', right_on=get_cloze['target'])

# add the cloze semantic similarity of each word
corrStimuli2 = stimuli.merge(word_similarities, on='Word')
# optional export of the merged table:
# corrStimuli2.to_excel('C:/Users/fm02/OwnCloud/Sentences/stimuli_all_onewordsemsim.xlsx',index=False)
# Keep only the variables entering the correlation matrix.
# .copy() makes the selection an independent frame, so the column assignment
# below does not write into a slice of the merged table.
corrStimuli2 = corrStimuli2[['ConcM',
                             'LEN',
                             'UN2_F',
                             'UN3_F',
                             'Orth',
                             'OLD20',
                             'FreqCount',
                             'LogFreq(Zipf)',
                             'V_MeanSum',
                             'A_MeanSum',
                             'mink3_SM',
                             'Position',
                             'BLP_rt',
                             'BLP_accuracy',
                             'similarity',
                             'PRECEDING_Frequency',
                             "PRECEDING_LogFreq(Zipf)",
                             "LENprec",
                             'Predictability',
                             'cloze',
                             'plausibility',
                             'Sim']].copy()
# make sure 'Sim' is numeric before correlating
corrStimuli2['Sim'] = pd.to_numeric(corrStimuli2['Sim'])
corrmat2 = corrStimuli2.corr()

# readable axis labels, in the same order as the columns selected above
corrmatlabels2 = ["Concreteness",
                  "#letters",
                  "BigramFreq",
                  "TrigramFreq",
                  "OrthN",
                  "OLD20",
                  "Frequency",
                  "LogFreq(Zipf)",
                  "Valence",
                  "Arousal",
                  "SensorimotorStrength",
                  "PositionSent",
                  "BLP_rt",
                  "BLP_accuracy",
                  "Semantic_Similarity_CONTEXT",
                  "PRECEDING_Frequency",
                  "PRECEDING_LogFreq(Zipf)",
                  "PRECEDING_#letters",
                  "Apriori_Predictability",
                  "Cloze_Probability",
                  "Plausibility",
                  "Semantic_Similarity_CLOZE"
                  ]

# In the heatmap, print the value only for correlations with 0.3 < |r| < 1;
# all the other cells are left blank.
# DataFrame.applymap is deprecated in favour of DataFrame.map in recent
# pandas; use whichever element-wise method this pandas version provides.
elementwise = corrmat2.map if hasattr(pd.DataFrame, 'map') else corrmat2.applymap
labelscorrmat2 = elementwise(lambda v: str(round(v, 2)) if (1 > abs(v) > 0.3) else '')

plt.subplots(1, 1, figsize=(15, 10))
# fmt='' because the annotations are already formatted strings
sns.heatmap(corrmat2, cmap="Spectral_r", annot=labelscorrmat2,
            annot_kws={'fontsize': 9}, fmt='', vmin=-1, vmax=1,
            xticklabels=corrmatlabels2,
            yticklabels=corrmatlabels2)
plt.title("Correlations matrix - all sentences")
plt.show()