-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimulation_study.py
317 lines (259 loc) · 13.3 KB
/
simulation_study.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 24 19:04:54 2022
@author: Max Temmerman and Marcio Reverbel
"""
import os
import numpy as np
import numpy.linalg as la
import pandas as pd
import scipy as sp
import scipy.stats as stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.tsa.api as tsa
import math
from helper import print_question, data_frame_to_latex_table_file
#------------------------------------------------------------------------------
def beta_hist(beta_hat, beta_avg, beta_std, num_std, i, j, fignum):
"""
This is a function to print the histgrams for the OLS estimates more easily.
It helps to iterate through each of the error scenarios and each of the coefficients.
"""
text = [r'$\hat{\beta}$ with (n = 10, ${\epsilon}$ ~ N(0, ${\sigma^2}$))',
r'$\hat{\beta}$ with (n = 100, ${\epsilon}$ ~ N(0, ${\sigma^2}$))',
r'$\hat{\beta}$ with (n = 10, ${\epsilon}$ ~ Laplace(0, ${\sigma^2}$))',
r'$\hat{\beta}$ with (n = 100, ${\epsilon}$ ~ Laplace(0, ${\sigma^2}$))']
fig, ax = plt.subplots(1, 1, num=fignum)
bins = np.linspace(beta_avg[i][j] - num_std * beta_std[i][j], beta_avg[i][j] + num_std * beta_std[i][j], 100)
ax.hist(beta_hat[i, j], bins, density=True)
ax.set_xlabel(r'$\beta_{}$'.format(i))
ax.set_title(text[j])
plt.savefig(figure_dir + 'figure_{}.png'.format(fignum))
plt.show()
return fignum + 1
#-----------------------------------------------------------------------------
def t_hist(t_val, t_avg, t_std, num_std, i, j, fignum):
"""
This is a function to print the histgrams of the t_values more easily.
It helps to iterate through each of the error scenarios and each of the coefficients.
"""
text = [r't-values with (n = 10, ${\epsilon}$ ~ N(0, ${\sigma^2}$))',
r't-values with (n = 100, ${\epsilon}$ ~ N(0, ${\sigma^2}$))',
r't-values with (n = 10, ${\epsilon}$ ~ Laplace(0, ${\sigma^2}$))',
r't-values with (n = 100, ${\epsilon}$ ~ Laplace(0, ${\sigma^2}$))']
fig, ax = plt.subplots(1, 1, num=fignum)
bins = np.linspace(t_avg[i][j] - num_std * t_std[i][j], t_avg[i][j] + num_std * t_std[i][j], 100)
ax.hist(t_val[i, j], bins, density=True)
ax.plot(bins, stats.t.pdf(bins, n_obs[j]-3), lw = 2, label = 't')
ax.plot(bins, stats.norm.pdf(bins), label = 'N')
ax.grid(True, ls=':')
ax.set_xlabel(r't-values for $\beta_{}$'.format(i))
ax.set_title(text[j])
ax.legend(loc='best')
plt.savefig(figure_dir + 'figure_{}.png'.format(fignum))
plt.show()
return fignum + 1
#------------------------------------------------------------------------------
def f_hist(f_val, f_avg, f_std, num_std, i, fignum):
"""
This is a function to print the histgrams of the f_values more easily.
It helps to iterate through each of the error scenarios.
"""
text = [r'Model test (n = 10, ${\epsilon}$ ~ N(0, ${\sigma^2}$))',
r'Model test (n = 100, ${\epsilon}$ ~ N(0, ${\sigma^2}$))',
r'Model test (n = 10, ${\epsilon}$ ~ Laplace(0, ${\sigma^2}$))',
r'Model test (n = 100, ${\epsilon}$ ~ Laplace(0, ${\sigma^2}$))']
lab = [r'${F_{2,7}}$', r'${F_{2,97}}$', r'${F_{2,7}}$', r'${F_{2,97}}$', r'${X{^2_2}}$']
fig, ax = plt.subplots(1, 1, num=fignum)
# IMPORTANT: These graphics don't construct well for n = 10 because of the enourmous difference
# in scale between the histogram data and the distributions. If you change the number of bins below, from
# 150 to 500, for example, it's possible to see the distribution in those two cases (Laplace and Normal,
# with n = 10), but the histograms don't look good. Basically, now the bins are too small to show the portion
# of the pdf that is farther from the horizontal line (x = 0).
bins = np.linspace(f_avg[i] - num_std * f_std[i], f_avg[i] + num_std * f_std[i], 150)
ax.hist(f_val[i], bins, density=True)
ax.plot(bins, stats.f.pdf(bins, 2, n_obs[i]-2), label=lab[i])
ax.plot(bins, stats.chi2.pdf(bins, 2), label = lab[4])
ax.grid(True, ls=':')
ax.set_xlabel(r'f-values for the model test: $\beta_0$ = $\beta_1$ = $\beta_2$ = 0')
ax.set_title(text[i])
ax.legend(loc='best')
plt.savefig(figure_dir + 'figure_{}.png'.format(fignum))
plt.show()
return fignum + 1
#-----------------------------------------------------------------------------
def eps_hist(eps, j, fignum):
"""
This is a function to print the histgrams of the error terms more easily.
It helps to iterate through each of the error types and each of the coefficients.
"""
tit = [r'${\epsilon}$ ~ N(0, 0.25${^2}$), n = 10',
r'${\epsilon}$ ~ N(0, 0.25${^2}$), n = 100',
r'${\epsilon}$ ~ Laplace(0, 0.25${^2}$), n = 10',
r'${\epsilon}$ ~ Laplace(0, 0.25${^2}$), n = 100']
fig, ax = plt.subplots(1, 1, num=fignum)
ax.hist(eps.flatten(), 100, density=True)
ax.set_xlabel(r'${\epsilon}$')
ax.set_title(tit[j])
plt.savefig(figure_dir + 'figure_{}.png'.format(fignum))
plt.show()
return fignum + 1
#------------------------------------------------------------------------------
# Adjusting the working directory
cwd = os.getcwd()
print('Current Working Directory is: ', cwd, '\n')
"""
absolute_path = '../python'
os.chdir(absolute_path)
print('New working directory is: ', os.getcwd(), '\n')
"""
#------------------------------------------------------------------------------
# Set the folders for output of graphs and tables
# -----------------------------------------------------------------------------
# for the figures
figure_dir = '../figures/'
if not os.path.exists(figure_dir):
os.makedirs(figure_dir)
# for the latex document
report_dir = '../report/'
if not os.path.exists(report_dir):
os.makedirs(report_dir)
#------------------------------------------------------------------------------
# Code starts here!
#------------------------------------------------------------------------------
bd_1 = 1106
bd_2 = 706
group_seed = bd_1*bd_2
rng = np.random.default_rng(group_seed)
fignum = 1
""" QUESTION 1 - Creating the matrix of observations """
mu = {}
mu["x1"] = 5
mu["x2"] = 0
std = {}
std["x1"] = 1
std["x2"] = 2
rho = 0.5
# building the covariance matrix between x1 and x2:
cov = np.array([[pow(std["x1"], 2), 0.5*std["x1"]*std["x2"]],[0.5*std["x1"]*std["x2"], pow(std["x2"], 2)]])
x1, x2 = rng.multivariate_normal([mu["x1"], mu["x2"]], cov, 100).T
x = np.vstack([np.ones(100), x1, x2]).T
""" QUESTION 2 - Generating error terms for each observation """
#------------------------------------------------------------------------------
# The real values of the coefficients. This is known only to the "creator".
beta = np.array([10, 0.5, 2.2])
sigma= 0.25
# the number of observations was copied twice into the tuple to make it easier to iterate
# when making calculations and passing information to functions.
n_obs = (10, 100, 10, 100)
n_sim = pow(2, 16)
# Now we generate the errors and place them into two dictionaries of lists, separated by distribution.
eps = {}
eps["normal"] = [np.empty((n_obs[0], n_sim)), np.empty((n_obs[1], n_sim))]
eps["laplace"] = [np.empty((n_obs[0], n_sim)), np.empty((n_obs[1], n_sim))]
eps["normal"][0] = rng.normal(0, sigma, (n_obs[0], n_sim))
eps["normal"][1] = rng.normal(0, sigma, (n_obs[1], n_sim))
eps["laplace"][0] = rng.laplace(0, sigma/np.sqrt(2), (n_obs[0], n_sim))
eps["laplace"][1] = rng.laplace(0, sigma/np.sqrt(2), (n_obs[1], n_sim))
""" QUESTION 3 - Generating the dependent variable """
#-----------------------------------------------------------------------------
# We create the X for 10 observations and the X for 100 observations in the temp variable.
temp = [x[0:10, :] @ beta, x @ beta]
y = [np.empty((n_obs[0], n_sim)), np.empty((n_obs[1], n_sim)), np.empty((n_obs[2], n_sim)), np.empty((n_obs[3], n_sim))]
# We create the y variable using the appropriate errors and x (temp).
y[0] = np.add(eps["normal"][0].T, temp[0]).T
y[1] = np.add(eps["normal"][1].T, temp[1]).T
y[2] = np.add(eps["laplace"][0].T, temp[0]).T
y[3] = np.add(eps["laplace"][1].T, temp[1]).T
#------------------------------------------------------------------------------
""" QUESTION 4 - Plotting the error terms """
# Here we iterate through all simulation scenarios to create the relevant histograms.
# See the eps_hist function for more details.
i = 0
for value in eps.values():
fignum = eps_hist(value[0], i, fignum)
fignum = eps_hist(value[1], i + 1, fignum)
i =+ 2
""" QUESTION 5 - Calculate the OLS estimates, t-tests, and model test """
# We created a lambda function that generates the regression with inputs being simulation scenario and simulation number.
# It works, but extremely slow. To make the code faster, we've calculated the OLS estimates and t-values manually.
# The code for using the OLS.results.params and results.t_test is there, but commented. If you wish to use it,
# un-comment all the lines starting with '###'. It will take some 7 minutes to run.
# The np.isclose() function was used to compare the results from manual calculations and the functional outputs
# for the OLS estimates and t values. All values match.
r = lambda i, s: sm.OLS(y[i][:, s], x[:n_obs[i]]).fit()
###beta_OLS = np.empty((3, 4, n_sim))
###t_val_OLS = np.empty((3, 4, n_sim))
f_val = np.empty((4, n_sim))
for s in range(n_sim):
for i in range(4):
results = r(i, s)
### beta_OLS[0, i, s], beta_OLS[1, i, s], beta_OLS[2, i, s] = results.params
### t_val_OLS[0, i, s], t_val_OLS[1, i, s], t_val_OLS[2, i, s], = results.t_test('const = 10, x1 = 0.5, x2 = 2.2').tvalue
f_val[i, s] = results.fvalue
# Calculating the estimates manually
beta_hat = {}
beta_hat["normal"] = [np.empty((3, n_sim)).T, np.empty((3, n_sim)).T]
beta_hat["laplace"] = [np.empty((3, n_sim)).T, np.empty((3, n_sim)).T]
for i in range(0, n_sim):
beta_hat["normal"][0][i] = la.inv(x[0:10].T @ x[0:10]) @ x[0:10].T @ y[0][:, i]
beta_hat["normal"][1][i] = la.inv(x.T @ x) @ x.T @ y[1][:, i]
beta_hat["laplace"][0][i] = la.inv(x[0:10].T @ x[0:10]) @ x[0:10].T @ y[2][:, i]
beta_hat["laplace"][1][i] = la.inv(x.T @ x) @ x.T @ y[3][:, i]
sim = (beta_hat["normal"][0], beta_hat["normal"][1], beta_hat["laplace"][0], beta_hat["laplace"][1])
b0 = np.array([sim[0][:, 0], sim[1][:, 0], sim[2][:, 0], sim[3][:, 0]])
b1 = np.array([sim[0][:, 1], sim[1][:, 1], sim[2][:, 1], sim[3][:, 1]])
b2 = np.array([sim[0][:, 2], sim[1][:, 2], sim[2][:, 2], sim[3][:, 2]])
b_hat = np.array([b0, b1, b2])
###print(np.isclose(beta_OLS, b_hat))
###print('\n')
#------------------------------------------------------------------------------
# Calculating the t-tests manually
# Here we develop a lambda function that allows us to easily calculate y_hat for each simulation scenario
y_hat = (lambda i : x[0:n_obs[i]] @ [b0[i], b1[i], b2[i]])
e = [np.empty((n_obs[0], n_sim)), np.empty((n_obs[1], n_sim)), np.empty((n_obs[0], n_sim)), np.empty((n_obs[1], n_sim))]
for i in range(4):
e[i] = y[i] - y_hat(i)
t_val = np.empty((3, 4, n_sim))
for i in range(4):
for j in range(3):
""" Here we are calculating the t-value with the formula learned in class: t = (B_hat - Bnull)/SE(index j)
with SE = 1/sqrt(n-k) * e'e * sqrt(X'X). It looks slightly more complicated because it is being
calculated for all betas (j) and simulation scenarios (i)."""
t_val[j, i] = (b_hat[j][i] - beta[j])/np.sqrt(np.sum(e[i]**2, 0)/(n_obs[i]-3)*
la.inv(x[0:n_obs[i]].T @ x[0:n_obs[i]])[j, j])
#OBS: degrees of freedom = n - k - 1, n_obs - number of independent variables - 1.
###print(np.isclose(t_val_OLS, t_val))
print("b_hat dimensions: {}\nt_val dimensions: {}\nf_val dimensions: {}".format(b_hat.shape, t_val.shape, f_val.shape))
#------------------------------------------------------------------------------
""" QUESTION 6, 7, 8 - Creating histograms for each of the quantities found,
and superposing the t, f, normal and chi-square distributions over the relevant
histograms. Consult the functions at the top of the document for more information.
"""
# calculate the average of all the OLS estimates
beta_avg = np.array([np.mean(b_hat[0], 1), np.mean(b_hat[1], 1), np.mean(b_hat[2], 1)])
# same for the standard deviation
beta_std = np.array([np.std(b_hat[0], 1), np.std(b_hat[1], 1), np.std(b_hat[2], 1)])
# used for the bins in the histogram later on
num_std = 3.5
# create figures:
# Using the beta_hist function to plot all 12 histograms for the OLS estimates
for i in range(3):
for j in range(4):
fignum = beta_hist(b_hat, beta_avg, beta_std, num_std, i, j, fignum)
# calculate the average of all the OLS estimates
t_avg = np.array([np.mean(t_val[0], 1), np.mean(t_val[1], 1), np.mean(t_val[2], 1)])
# same for the standard deviation
t_std = np.array([np.std(t_val[0], 1), np.std(t_val[1], 1), np.std(t_val[2], 1)])
# Using the t_hist function to plot all 12 histograms for the OLS estimates
for i in range(3):
for j in range(4):
fignum = t_hist(t_val, t_avg, t_std, num_std, i, j, fignum)
# Using the f_hist function to plot all 4 histograms of the F values:
f_avg = np.mean(f_val, 1)
f_std = np.std(f_val, 1)
for i in range(4):
fignum = f_hist(f_val, f_avg, f_std, num_std, i, fignum)
#------------------------------------------------------------------------------