-
Notifications
You must be signed in to change notification settings - Fork 1
/
gaussian_code_exercise.py
258 lines (171 loc) · 8.06 KB
/
gaussian_code_exercise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#!/usr/bin/env python
# coding: utf-8
# # Gaussian Code Exercise
#
# Read through the code below and fill out the TODOs. You'll find a cell at the end of the Jupyter notebook containing unit tests. After you've run the code cell with the Gaussian class, you can run the final cell to check that your code functions as expected.
#
# This exercise includes a file called 'numbers.txt', which you can see if you click on the 'Jupyter' icon at the top of the workspace and then go into the folder titled 3.OOP_code_gaussian_class. The 'numbers.txt' file is read in by the read_data_file() method. There is also a solution in the 3.OOP_code_gaussian_class folder in a file called answer.py.
# In[1]:
import math
import matplotlib.pyplot as plt
class Gaussian:
    """Gaussian distribution class for calculating and visualizing a
    Gaussian distribution.

    Attributes:
        mean (float): mean value of the distribution
        stdev (float): standard deviation of the distribution
        data (list of numbers): values loaded from a data file
    """

    def __init__(self, mu=0, sigma=1):
        self.mean = mu
        self.stdev = sigma
        self.data = []

    def calculate_mean(self):
        """Calculate the mean of the data set and store it in self.mean.

        Args:
            None

        Returns:
            float: mean of the data set

        Raises:
            ZeroDivisionError: if self.data is empty
        """
        self.mean = sum(self.data) / float(len(self.data))
        return self.mean

    def calculate_stdev(self, sample=True):
        """Calculate the standard deviation of the data set and store it
        in self.stdev.

        The mean is recomputed from self.data first (which also refreshes
        self.mean), so the result never depends on a stale mean such as
        the one passed to the constructor.

        Args:
            sample (bool): True if the data is a sample (divide by n - 1),
                False if it is a whole population (divide by n)

        Returns:
            float: standard deviation of the data set

        Raises:
            ZeroDivisionError: if self.data is empty, or holds a single
                value while sample is True
        """
        n = len(self.data) - 1 if sample else len(self.data)
        mean = self.calculate_mean()
        squared_deviations = sum((d - mean) ** 2 for d in self.data)
        self.stdev = math.sqrt(squared_deviations / n)
        return self.stdev

    @staticmethod
    def _parse_number(text):
        """Convert one line of text to an int when possible, else a float."""
        try:
            return int(text)
        except ValueError:
            return float(text)

    def read_data_file(self, file_name, sample=True):
        """Read in data from a txt file holding one number per line.

        Blank lines are skipped. The numbers are stored in the data
        attribute, then the mean and standard deviation are recalculated.

        Args:
            file_name (string): name of a file to read from
            sample (bool): whether the data represents a sample or a
                population; forwarded to calculate_stdev()

        Returns:
            None

        Raises:
            ValueError: if a non-blank line is not a valid number
        """
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_name) as file:
            data_list = [self._parse_number(line) for line in file if line.strip()]

        self.data = data_list
        self.calculate_mean()
        self.calculate_stdev(sample)

    def plot_histogram(self):
        """Output a histogram of the instance variable data using the
        matplotlib pyplot library.

        Args:
            None

        Returns:
            None
        """
        plt.hist(self.data)
        plt.title("Histogram of Data")
        plt.xlabel("data")
        plt.ylabel("count")

    def pdf(self, x):
        """Probability density function calculator for the gaussian
        distribution, evaluated with self.mean and self.stdev.

        Args:
            x (float): point for calculating the probability density function

        Returns:
            float: probability density function output
        """
        coefficient = 1.0 / (self.stdev * math.sqrt(2 * math.pi))
        z = (x - self.mean) / self.stdev
        return coefficient * math.exp(-0.5 * z ** 2)

    def plot_histogram_pdf(self, n_spaces=50):
        """Plot the normalized histogram of the data and a plot of the
        probability density function along the same range.

        Args:
            n_spaces (int): number of data points

        Returns:
            list: x values for the pdf plot
            list: y values for the pdf plot
        """
        min_range = min(self.data)
        max_range = max(self.data)

        # interval between consecutive x values
        interval = 1.0 * (max_range - min_range) / n_spaces

        # x starts at the minimum and takes n_spaces steps, so the maximum
        # itself is not included
        x = [min_range + interval * i for i in range(n_spaces)]
        y = [self.pdf(point) for point in x]

        # make the plots
        fig, axes = plt.subplots(2, sharex=True)
        fig.subplots_adjust(hspace=.5)
        axes[0].hist(self.data, density=True)
        axes[0].set_title('Normed Histogram of Data')
        axes[0].set_ylabel('Density')

        axes[1].plot(x, y)
        axes[1].set_title('Normal Distribution for \n Sample Mean and Sample Standard Deviation')
        # label the pdf subplot (the histogram subplot is labelled above)
        axes[1].set_ylabel('Density')
        plt.show()

        return x, y
# In[2]:
# Unit tests to check your solution
import unittest
class TestGaussianClass(unittest.TestCase):
    """Unit tests for the Gaussian class; the file-based tests read 'numbers.txt'."""

    def setUp(self):
        """Create a fresh Gaussian with mean 25 and stdev 2 for each test."""
        self.gaussian = Gaussian(25, 2)

    def test_initialization(self):
        """The constructor stores the mean and standard deviation it is given."""
        dist = self.gaussian
        self.assertEqual(dist.mean, 25, 'incorrect mean')
        self.assertEqual(dist.stdev, 2, 'incorrect standard deviation')

    def test_pdf(self):
        """The pdf evaluated at the mean matches the expected value to 5 places."""
        density = round(self.gaussian.pdf(25), 5)
        self.assertEqual(density, 0.19947, 'pdf function does not give expected result')

    def test_meancalculation(self):
        """calculate_mean() agrees with the arithmetic mean of the loaded data."""
        dist = self.gaussian
        dist.read_data_file('numbers.txt', True)
        expected = sum(dist.data) / float(len(dist.data))
        self.assertEqual(dist.calculate_mean(), expected, 'calculated mean not as expected')

    def test_stdevcalculation(self):
        """Sample and population standard deviations match the reference values."""
        cases = (
            (True, 92.87, 'sample standard deviation incorrect'),
            (False, 88.55, 'population standard deviation incorrect'),
        )
        for is_sample, expected, message in cases:
            self.gaussian.read_data_file('numbers.txt', is_sample)
            self.assertEqual(round(self.gaussian.stdev, 2), expected, message)
# Collect the test methods straight from the TestCase subclass. Passing a
# TestCase *instance* to loadTestsFromModule() only works by accident
# (the loader discovers the class through the instance's __class__ attribute).
tests_loaded = unittest.TestLoader().loadTestsFromTestCase(TestGaussianClass)
unittest.TextTestRunner().run(tests_loaded)
# In[ ]: