-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathstock_analysis_functions.py
392 lines (286 loc) · 13.8 KB
/
stock_analysis_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
import pandas as pd
import pandas_datareader.data as web
import datetime as dt
import yfinance as yf
import statsmodels
import matplotlib.pyplot as plt
import seaborn as sns
# function which collects data from yahoo finance and save the in the datasets
# input: tickers, start and end data, name of the country, read
# if read=False it means we have to create the dataset, if it is True it means that we have already
# create the datasets and we only have to read them
# outputs: datasets
def data_from_yahoo(tickers, ticker_id, start, end, country, read=False):
if read==True: # already created datasets
high = pd.read_csv(f'datasets/{country}_high.csv')
low = pd.read_csv(f'datasets/{country}_low.csv')
openn = pd.read_csv(f'datasets/{country}_open.csv')
close = pd.read_csv(f'datasets/{country}_close.csv')
adj_close = pd.read_csv(f'datasets/{country}_adj_close.csv')
datasets = {'high':high, 'low':low, 'open':openn, 'close':close, 'adj_close':adj_close}
return datasets
index = web.DataReader('TM', "yahoo", start, end).index # use one random stock to get the index
# initialise datasets
high = pd.DataFrame(index=index)
low = pd.DataFrame(index=index)
openn = pd.DataFrame(index=index)
close = pd.DataFrame(index=index)
volume = pd.DataFrame(index=index)
adj_close = pd.DataFrame(index=index)
for ticker in tickers:
print(ticker)
d_ticker = web.DataReader(ticker, "yahoo", start, end)
if ticker in ticker_id.keys(): # substitude code with the easier name
ticker = ticker_id[ticker]
high[ticker] = d_ticker.High
low[ticker] = d_ticker.Low
openn[ticker] = d_ticker.Open
close[ticker] = d_ticker.Close
adj_close[ticker] = d_ticker['Adj Close']
# save data
high.to_csv(f'datasets/{country}_high.csv')
low.to_csv(f'datasets/{country}_low.csv')
openn.to_csv(f'datasets/{country}_open.csv')
close.to_csv(f'datasets/{country}_close.csv')
adj_close.to_csv(f'datasets/{country}_adj_close.csv')
datasets = {'high':high, 'low':low, 'open':openn, 'close':close, 'adj_close':adj_close}
return datasets
# function that fills NaN function
# add_Date regulates if we must add the Date column
def fill_nan (datasets, index, name, add_Date=True):
print(f"\n{name}")
for key in datasets.keys():
print(f" \n{key}: \n")
dataset = datasets[key]
# count nans:
num_nans = dataset.isna().values.sum()
print(f"Totally, in the dataset there are {num_nans} NaNs values")
if num_nans>0:
if 'Date' in dataset.columns:
dataset = dataset.drop('Date', axis=1) # drop date because the next line works only with floats
dataset = (dataset.ffill() + dataset.bfill())/2
if add_Date==True:
dataset['Date'] = index # reintegrate the index
num_nans = dataset.isna().values.sum()
print(f"After the filling thought the mean, in the dataset there are {num_nans} NaNs values")
if num_nans>0:
# the still missing values are at the beginning or at the end. fill them with the first or last non NaN value
dataset = dataset.ffill()
dataset = dataset.bfill()
print(f"After the filling of first/last values, in the dataset there are {dataset.isna().values.sum()} NaNs values")
datasets[key] = dataset # update with the new datasets withous nans
return datasets
# function that extracts the n tickers with the highest value
# return a dataset with the extracted data if Names=False or the keys if Names=TRue
def extract_max(data, n, Names=True):
highest_mean = data.mean(axis = 0).sort_values(ascending=False) [:n]
if Names==True:
highest_name = highest_mean.keys()
return highest_name
else:
highest_data = data[highest_mean.keys()]
highest_data['Data'] = data['Data']
return highest_data
# function that plots time series prices
def plot_price(date_index, data, tickers, country, title, yylabel='Price'):
months = pd.to_datetime(date_index).dt.month
year = pd.to_datetime(date_index).dt.year
labels = (months.astype(str) + "-" + year.astype(str))
for name in tickers:
if name!='Date':
stock_path = data[name]
# find highest price
stock_max_date = pd.to_datetime(stock_path.idxmax())
stock_max_date_name = str(stock_max_date)[:10] # cut day and time
#plot time series
fig, ax = plt.subplots()
ax.plot(pd.to_datetime(date_index), stock_path, label=title)
# plot max
plt.axvline( x=stock_max_date, color='red', linestyle='--', alpha=0.5, label='max value at day {}'.format(stock_max_date_name))
titlee = name + ' ' + title
ax.xaxis_date()
fig.autofmt_xdate()
plt.title(titlee)
plt.ylabel(yylabel)
plt.legend()
plt.savefig(f'plots_stocks/{country}_price_{name}.pdf')
plt.show()
return
# function that plots time series prices with rolling
# it returns the rolled DataSet if return_rolled=True
def plot_price_rolling(date_index, data, day_rolling, tickers, country, title, yylabel='Price', return_rolled=False):
months = pd.to_datetime(date_index).dt.month
year = pd.to_datetime(date_index).dt.year
labels = (months.astype(str) + "-" + year.astype(str))
# data rolling:
data_rolled = pd.DataFrame(index=date_index)
data_rolled = data.rolling(day_rolling).mean().fillna(data_rolled.mean())
for name in tickers:
if name!='Date':
stock_path = data[name]
stock_path_rolled = data_rolled[name]
#plot time series
label_rolled = str(day_rolling) + '-day rolling mean'
fig, ax = plt.subplots()
ax.plot(pd.to_datetime(date_index), stock_path, label='daily', alpha=0.8)
ax.plot(pd.to_datetime(date_index), stock_path_rolled, label=label_rolled)
titlee = name + ' ' + title + 'with rolling window'
ax.xaxis_date()
fig.autofmt_xdate()
plt.title(titlee)
plt.ylabel(yylabel)
plt.legend()
#plt.savefig(f'plots_stocks/{country}_price_roll_{yylabel}_{name}.pdf')
plt.show()
if return_rolled == True:
return data_rolled
return
# function that given a time series retuns its return: r_t% = (X_t-X_t-1)/(X_t-1)*100
# if all=True we change the name of the file to save because it includes all the stocks of that country (needed to build the portfolios)
def returns(names, data, data_index, country, percentage=True, all=False):
if len(names)==1: # only one time series
# do not create a dataset but return directly a list
name = names[0]
if percentage == True:
return (data[name] - data[name].shift(1))/data[name].shift(1)*100
else:
return (data[name] - data[name].shift(1))/data[name].shift(1)
returns = pd.DataFrame()
for name in names:
if name != 'Date':
if percentage == True:
returns[name] = (data[name] - data[name].shift(1))/data[name].shift(1)*100
else:
returns[name] = (data[name] - data[name].shift(1))/data[name].shift(1)
returns['Date'] = data_index.values
returns = returns.set_index('Date')
#returns = returns.drop(returns.index[0]) # drop first row which is NaN
if all==True:
returns.to_csv(f"datasets/{country}_returns_all.csv")
else:
returns.to_csv(f"datasets/{country}_returns.csv")
return returns
# function that gets the 500 companies that composed the SP500 directly from Wikipedia
def get_sp500_stocks_data(index, start, end, read=False):
if read == True:
data = pd.read_csv('datasets/sp500_stocks.csv')
data = data.drop("Unnamed: 0", axis=1)
return data
# current sp500 components (tickers list)
sp_assets = pd.read_html(
'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
assets = sp_assets.Symbol.tolist()
# Download historical data to a multi-index DataFrame
try:
data = yf.download(assets, start=start, end=end, as_panel=False)
filename = 'sp_components_data.pkl'
data.to_pickle(filename)
print('Data saved at {}'.format(filename))
except ValueError:
print('Failed download, try again.')
data = None
# save data
data.to_csv('datasets/sp500_stocks.csv')
return data
# function that creates the SP500 index starting from the 500 prices
def create_SP500_index(df_500_stocks):
data_index = df_500_stocks.index
df_index = pd.DataFrame(index=data_index)
df_index['adj_price'] = df_500_stocks.mean(axis=1)
df_index['return_per'] = returns(['adj_price'], df_index, data_index, 'sp500', percentage=True)
return df_index
# function that creates the SP500 index and merge its to teh returns dataframe
def create_merge_SP500_index(df_500_stocks, df_returns):
df_index_sp500 = create_SP500_index(df_500_stocks)
# merge
df_to_merge = pd.DataFrame(df_index_sp500['return_per']).iloc[1:,:].rename(columns = {0:'sp500'})
df_returns_sp500 = df_returns.merge( df_to_merge , left_index=True, right_index=True)
df_returns_sp500 = df_returns_sp500.rename(columns = {'return_per': 'SP500'})
return df_returns_sp500
# function that plots variance
def plot_variance(names, data, country, title):
for name in names:
df = pd.DataFrame()
df[name] = data[name]
df['Month'] = pd.to_datetime(df.index).to_period("M")
plt.figure()
sns.boxplot(data=df, x="Month", y=name)
plt.xticks(rotation=90)
titlee = 'Variance of ' + title + ' of ' + name
plt.title(titlee)
plt.savefig(f'plots_stocks/{country}_var_{title}_{name}.pdf')
plt.show()
return
# function that plots two rolling times series
# different from plot_price_rolling because that plots the differenc between daily and rolling
def plot_rolling_timeseries(date_index, names, dfs, day_rolling, country, title):
months = pd.to_datetime(date_index).dt.month
year = pd.to_datetime(date_index).dt.year
labels = (months.astype(str) + "-" + year.astype(str))
# data rolled
dfs['open'] = dfs['open'].rolling(day_rolling).mean().fillna(dfs['open'].mean())
dfs['high'] = dfs['high'].rolling(day_rolling).mean().fillna(dfs['high'].mean())
dfs['low'] = dfs['low'].rolling(day_rolling).mean().fillna(dfs['low'].mean())
dfs['close'] = dfs['close'].rolling(day_rolling).mean().fillna(dfs['close'].mean())
for name in names:
if name!='Date':
# plot
label_open = 'Open ' + str(day_rolling) + '-day rolling mean'
label_close = 'Close ' + str(day_rolling) + '-day rolling mean'
label_low = 'Low ' + str(day_rolling) + '-day rolling mean'
label_high = 'high ' + str(day_rolling) + '-day rolling mean'
fig, ax = plt.subplots()
ax.plot(pd.to_datetime(date_index), dfs['open'][name], label=label_open)
ax.plot(pd.to_datetime(date_index), dfs['close'][name], label=label_close)
ax.plot(pd.to_datetime(date_index), dfs['low'][name], label=label_low)
ax.plot(pd.to_datetime(date_index), dfs['high'][name], label=label_high)
titlee = name + ' ' + title + ' with rolling window'
ax.xaxis_date()
fig.autofmt_xdate()
plt.title(titlee)
plt.legend()
plt.savefig(f'plots_stocks/{country}_rolling_timeseries_{name}.pdf')
plt.show()
return
# function that plots two time series to compare some stocks and the sp500 index
def plot_sp500_comparison (df, names, country, title):
for name in names:
y2 = df[name]
y1 = df['SP500']
plt.plot(pd.to_datetime(df.index), y2, label=name, color='b', linewidth=1)
plt.plot(pd.to_datetime(df.index), y1, label='SP500', color='m', linewidth=1)
plt.legend()
titlee = 'Comparinson between ' + title + ' of ' + name + ' vs SP500'
plt.title(titlee)
plt.ylabel('Returns in %')
plt.savefig(f'plots_stocks/{country}_comparison_sp500_{name}.pdf')
plt.show()
return
# function that plots two time series with rolling window to compare some stocks and the sp500 index
def plot_sp500_comparison_rolling (df, names, day_rolling, country, title):
for name in names:
y2 = df[name].rolling(day_rolling).mean()
y1 = df['SP500'].rolling(day_rolling).mean()
label1 = name + ' ' + str(day_rolling) + '-day rolling mean'
label2 = 'SP500 ' + str(day_rolling) + '-day rolling mean'
plt.plot(pd.to_datetime(df.index), y2, label=label1, color='b', linewidth=1)
plt.plot(pd.to_datetime(df.index), y1, label=label2, color='m', linewidth=1)
plt.legend()
titlee = 'Comparinson between ' + title + ' of ' + name + ' vs SP500'
plt.title(titlee)
plt.ylabel('Returns in %')
plt.savefig(f'plots_stocks/{country}_comparison_sp500_roll_{name}.pdf')
plt.show()
return
# function that plots returns in periods of lockdowns
def plot_lockdown_return (df, country, period_lock):
select_rows_dataframe = (df.index >= period_lock[0]) & (df.index <= period_lock[1])
data = df[select_rows_dataframe]
fig, ax = plt.subplots(figsize=(8,5))
titlee = country + ' ' + 'returns % (7 days rolling window)'
plt.title(titlee)
plt.ylabel('Return %')
data.plot(ax=ax)
plt.savefig(f'plots_stocks/returns_lockdowns_{country}.PNG')
plt.show()
return