-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbias_calculator.py
205 lines (145 loc) · 5.95 KB
/
bias_calculator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# importing module
from pandas import *
import math
min_samples = 1
# Reads in CSV and processes it slightly
def read(file, cumulative):
data = read_csv(file)
data.fillna(0, inplace=True)
if cumulative== True:
data = make_df_cumulative(data)
return data
# This function takes a CSV and feeds its columns into get_bias
# Separated by age
def calculate_columns(data, max_age=10):
    """Compute bias, type count, token count and overlap for every age.

    For each age from 0 to ``max_age`` (inclusive) the columns
    ``"<age>null"`` and ``"<age>overt"`` are read from ``data`` and handed to
    ``get_bias`` and ``get_overlap``.

    Args:
        data: Table indexable by column name whose columns support
            ``.tolist()``; it must contain a null/overt column pair for
            every age in range.
        max_age: Highest age to process (inclusive). Defaults to 10.

    Returns:
        A 4-tuple of lists ``(bias_array, type_array, token_array,
        overlap_array)``, each with one entry per age in ascending order.
    """
    bias_array = []
    type_array = []
    token_array = []
    overlap_array = []
    for age in range(max_age + 1):
        null_col = data[str(age) + "null"].tolist()
        overt_col = data[str(age) + "overt"].tolist()
        bias, type_count, token = get_bias(null_col, overt_col)
        overlap = get_overlap(null_col, overt_col)
        bias_array.append(bias)
        type_array.append(type_count)
        token_array.append(token)
        overlap_array.append(overlap)
    return bias_array, type_array, token_array, overlap_array
# This function finds the empirical overlap between two forms.
def get_overlap(col1, col2, min_sample=1):
    """Return the empirical overlap ratio between two count columns.

    A row counts as a "type" when its two counts sum to at least
    ``min_sample``. A counted row is an "overlap" when both of its counts are
    positive. The result is overlaps divided by types.

    Args:
        col1: Sequence of counts (anything ``float()`` accepts).
        col2: Sequence of counts, expected to have the same length as ``col1``.
        min_sample: Minimum combined count for a row to be considered.
            Defaults to 1.

    Returns:
        The overlap ratio as a float, or 0 when no row qualifies. Columns of
        different lengths are not compared at all and also yield 0.
    """
    type_counter = 0
    overlap_counter = 0
    if len(col1) == len(col2):
        for first, second in zip(col1, col2):
            # Convert once per row instead of on every comparison.
            first = float(first)
            second = float(second)
            if first + second >= min_sample:
                type_counter += 1
                if first > 0 and second > 0:
                    overlap_counter += 1
    if type_counter == 0:
        return 0
    return overlap_counter / type_counter
# This function adds up the favored form in each row between the two columns,
# Then divides that by the total tokens
def get_bias(col1, col2, min_sample=1):
    """Return the share of tokens that use each row's favored form.

    For every row whose two counts sum to at least ``min_sample``, the larger
    of the two counts is treated as the favored form. The bias is the sum of
    favored counts divided by the sum of all counted tokens, rounded to two
    decimals.

    Args:
        col1: Sequence of counts (anything ``float()`` accepts).
        col2: Sequence of counts, expected to have the same length as ``col1``.
        min_sample: Minimum combined count for a row to be considered.
            Defaults to 1.

    Returns:
        A tuple ``(bias, type_counter, total_tokens)``: the rounded bias (0
        when no tokens were counted), the number of rows that met the
        threshold, and the total token count over those rows. When the
        columns differ in length a message is printed and
        ``("n/a", "n/a", "n/a")`` is returned instead.
    """
    if len(col1) != len(col2):
        print("Columns are not the same size")
        return "n/a", "n/a", "n/a"

    total_tokens = 0
    total_fav = 0
    type_counter = 0
    for first, second in zip(col1, col2):
        # Convert once per row instead of on every use.
        first = float(first)
        second = float(second)
        if first + second >= min_sample:
            type_counter += 1
            total_tokens = total_tokens + first + second
            total_fav = total_fav + max(first, second)

    if total_tokens > 0:
        bias = round(total_fav / total_tokens, 2)
    else:
        bias = 0
    return bias, type_counter, total_tokens
# Makes the df counts cumulative across age
def make_df_cumulative(data, max_age=10):
    """Turn the per-age null/overt counts into running totals across age.

    After the call, column ``"<age>null"`` holds the sum of the original
    ``"0null"`` .. ``"<age>null"`` columns, and likewise for ``"overt"``.

    Args:
        data: Frame containing ``"<age>null"`` and ``"<age>overt"`` columns
            for every age from 0 to ``max_age``. It is modified in place.
        max_age: Highest age column to accumulate (inclusive). Defaults to 10.

    Returns:
        The same ``data`` object, now holding cumulative counts.
    """
    null_cumulative = data["0null"]
    overt_cumulative = data["0overt"]
    # Age 0 is the starting point, so accumulation begins at age 1.
    for age in range(1, max_age + 1):
        null_name = str(age) + "null"
        overt_name = str(age) + "overt"
        null_cumulative = data[null_name] + null_cumulative
        data[null_name] = null_cumulative
        overt_cumulative = data[overt_name] + overt_cumulative
        data[overt_name] = overt_cumulative
    return data
# Calculates the expected overlap between determiners (or complementizers in this case)
def expected_value(B, N, S):
    """Return the expected overlap ratio under a Zipfian sampling model.

    Each of the ``N`` types is assigned the probability
    ``p_r = 1 / (r * H_N)`` for rank ``r`` (Zipf's law, with ``H_N`` the N-th
    harmonic number). For each type, the chance of being seen with both forms
    in ``S`` draws is one minus the chance of never being sampled, minus the
    chance of being sampled only with the first form, minus the chance of
    being sampled only with the second form. These per-type chances are
    averaged over all types.

    Args:
        B: Probability that a token uses the first form (the bias); the
            second form has probability ``1 - B``.
        N: Number of types. Must be an integer.
        S: Number of sampled tokens.

    Returns:
        The average probability of overlap across the ``N`` types, or 0 when
        ``N`` is 0.
    """
    if N == 0:
        return 0

    # Harmonic number H_N, the normalizer for the Zipfian probabilities.
    harm_n = 0
    for rank in range(1, N + 1):
        harm_n += 1 / rank

    expected_sum = 0
    for rank in range(1, N + 1):
        # Probability of the type at this rank among N types.
        p_r = 1 / (rank * harm_n)
        # Type is never drawn in S trials (computed once, reused below).
        never_sampled = math.pow(1 - p_r, S)
        # Type is drawn at least once, but only ever with the first form.
        only_first = math.pow(B * p_r + 1 - p_r, S) - never_sampled
        # Type is drawn at least once, but only ever with the second form.
        only_second = math.pow((1 - B) * p_r + 1 - p_r, S) - never_sampled
        # What remains is the chance that both forms were seen.
        expected_sum += 1 - never_sampled - only_first - only_second

    # Average across the types.
    return expected_sum / N
# The whole process, performed and written to a csv file
def csv_to_bias(file):
    """Run the full pipeline for one CSV and write a predictions CSV.

    The input is loaded with ``read`` (cumulatively when the file name
    contains ``"dir"``), per-age statistics come from ``calculate_columns``,
    and the predicted overlap for each age comes from ``expected_value``.
    One row per age is written to ``<file minus last 4 chars>_2predictions.csv``.

    Args:
        file: Path of the input CSV, as a string.

    Returns:
        A 4-tuple of per-age lists ``(bias_array, type_array, token_array,
        pred_array)``.
    """
    # NOTE(review): the substring test also matches "dir" anywhere in the
    # path, e.g. in a directory name; confirm that is intended.
    df = read(file, "dir" in file)
    bias_array, type_array, token_array, overlap_array = calculate_columns(df)
    pred_array = [
        expected_value(bias, types, tokens)
        for bias, types, tokens in zip(bias_array, type_array, token_array)
    ]

    # Drops the last four characters; assumes a ".csv"-style extension.
    filename = file[0:-4] + "_2predictions.csv"
    with open(filename, "w") as outfile:
        # Write the first line of the csv so we know what we're looking at.
        outfile.write("age, bias, N, S, pred, empirical\n")
        rows = zip(bias_array, type_array, token_array, pred_array, overlap_array)
        for age, row in enumerate(rows):
            outfile.write(",".join(str(value) for value in (age,) + row) + "\n")
    return bias_array, type_array, token_array, pred_array
# Testing bias
#the_val = [5,30,20,5]
#a_val = [20,10,10,0]
#print(get_bias(the_val, a_val))
# Input files to process; csv_to_bias writes one "_2predictions.csv" file per input.
file_list = [
    "english_child.csv",
    "english_child_dir.csv",
    "spanish_child.csv",
    "spanish_child_dir.csv",
]
for file in file_list:
    # The results of the last processed file stay bound at module level.
    bias_array, type_array, token_array, pred_array = csv_to_bias(file)
#expected_array = []
#for i in range(len(bias_array)):
# expected_array.append(expected_value(bias_array[i], type_array[i], token_array[i]))
#print(expected_array)
#print(expected_value(0.826, 363, 1472))