-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
78 lines (59 loc) · 1.93 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from sklearn import preprocessing
import pandas as pd
import numpy as np
import loadData
# Number of consecutive rows in each input window built by process_data
# (i.e. how many days of data form one series used to train the RNN).
SERIES_LENGTH = 30

# How many rows ahead process_data looks when it compares the future close
# with the current close to derive the label.
PREDICT_LENGTH = 7

# Column-name prefix: process_data reads the f"{TICKER}_Close" column.
TICKER = "NIFTY_50"
def normalize_data(df):
    """Placeholder hook for an alternative normalising/scaling technique.

    Deliberately left unimplemented: fill this in if a different approach to
    normalising and scaling the data is wanted. At present the argument is
    ignored and ``None`` is returned.
    """
    return None
def scale_data(df):
    """Scale every column of ``df`` in place and return the same frame.

    Each column's raw values are passed through
    ``sklearn.preprocessing.scale`` (called with its default arguments) and
    the result is written back over the original column.

    Args:
        df: DataFrame whose columns are to be scaled. It is modified in
            place.

    Returns:
        The same ``df`` object, with every column replaced by its scaled
        values.
    """
    for name in df.columns:
        raw_values = df[name].values
        df[name] = preprocessing.scale(raw_values)
    return df
def process_data(df):
    """Convert a price DataFrame into balanced, shuffled (window, label) arrays.

    Steps:
      1. Add a ``Label`` column: 1 when the ``{TICKER}_Close`` value
         ``PREDICT_LENGTH`` rows ahead is >= the current close, else 0.
      2. Write the labelled frame to ``nifty50_future_label.csv``.
      3. Scale all non-label columns with ``scale_data`` and cut them into
         overlapping windows of ``SERIES_LENGTH`` rows.
      4. Truncate the larger class so both labels occur equally often, then
         shuffle.

    Args:
        df: DataFrame containing a ``{TICKER}_Close`` column. It is modified
            in place: rows containing NaN are dropped (this includes the
            trailing rows that have no future price) and a ``Label`` column
            is added.

    Returns:
        tuple: ``(X, y)`` where ``X`` is ``np.array`` of the stacked feature
        windows and ``y`` is ``np.array`` of the matching 0/1 labels.
    """
    close_column = f"{TICKER}_Close"

    df["nifty_future_price"] = df[close_column].shift(-PREDICT_LENGTH)
    # Dropping any NaN values (the last PREDICT_LENGTH rows have no future price).
    df.dropna(inplace=True)

    # Label is 1 if the future price is at least today's price, 0 otherwise.
    # The same TICKER-derived column is used as for the shift above, so that
    # changing TICKER keeps both sides of the comparison consistent.
    df["Label"] = np.where(df["nifty_future_price"] >= df[close_column], 1, 0)

    # The helper column is no longer required. `columns=` is used because
    # passing the axis positionally is rejected by pandas >= 2.0.
    df.drop(columns="nifty_future_price", inplace=True)
    df.to_csv('nifty50_future_label.csv')

    # Work on an explicit copy: scale_data assigns into the frame it is given,
    # and assigning into a slice of `df` raises SettingWithCopyWarning.
    features = df.loc[:, df.columns != 'Label'].copy()
    features = scale_data(features)

    feature_values = features.to_numpy()
    # Read the label by name rather than assuming it is the last column.
    labels = df["Label"].to_numpy()

    # NOTE(review): each window covers rows [i, i + SERIES_LENGTH) but its
    # label comes from row i + SERIES_LENGTH, i.e. the row immediately AFTER
    # the window. This is preserved from the original code -- confirm that
    # the one-row offset is intended.
    sequence = [
        [feature_values[i:i + SERIES_LENGTH], labels[i + SERIES_LENGTH]]
        for i in range(len(feature_values) - SERIES_LENGTH)
    ]
    # Shuffle before balancing so the truncation below drops random samples.
    np.random.shuffle(sequence)

    buy = [sample for sample in sequence if sample[1] != 0]
    sell = [sample for sample in sequence if sample[1] == 0]

    # Keep the same number of samples from each class.
    keep = min(len(buy), len(sell))
    sequence = buy[:keep] + sell[:keep]
    # Shuffle again so the two classes are interleaved.
    np.random.shuffle(sequence)

    X = [window for window, _ in sequence]
    y = [label for _, label in sequence]
    return np.array(X), np.array(y)
# df=loadData.load()
# process_data(df)