#Factors influencing loan prediction:
#Salary, Loan term, Previous history, EMI, Loan amount
#LOADING LIBRARIES
import pandas as pd # for working with data
import numpy as np # For mathematical calculations
import seaborn as sns # For data visualization
import matplotlib.pyplot as plt # For plotting graphs
import warnings # For controlling warning messages
warnings.filterwarnings("ignore") # To ignore any warnings
from sklearn.model_selection import train_test_split #To split training data into train and validate
from sklearn.linear_model import LogisticRegression #To build Logistic regression model
from sklearn import tree #To build Decision trees
from sklearn.ensemble import RandomForestClassifier #To build Random forest
from xgboost import XGBClassifier #To build XGB Classifier
from sklearn.metrics import accuracy_score #Accuracy metric
from sklearn.model_selection import StratifiedKFold #To perform Kfold cross validation
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
##########################################################################################################################################################
#Loading Dataset
train=pd.read_csv("Training_dataset.txt")
#Making copies of dataset
train_original=train.copy()
#Data types
print("DATA TYPES OF EACH FEATURE INVOLVED IN DATASET:")
print(train.dtypes)
print("\nLoan_Status is our target variable")
print("\n\nDESCRIPTION OF DATA TYPES INVOLVED IN DATASET:")
print("object: Text or mixed values (pandas stores strings as objects)")
print("int64: Integer-valued variables")
print("float64: Variables with decimal values")
#Dimensionality of train dataset
print("\n\nDimensionality of train dataset")
print(train.shape)
##########################################################################################################################################################
#UNIVARIATE ANALYSIS -> Summarizes each variable on its own; no comparison between variables
print("\n\nUNIVARIATE ANALYSIS")
#Visualizing dependent variable - Loan_Status
print("\nLOAN_STATUS - FREQUENCY")
print(train['Loan_Status'].value_counts())
#To print frequencies as proportions
print("LOAN_STATUS - PROPORTION")
print(train['Loan_Status'].value_counts(normalize=True))
#Plotting Loan_Status frequency
train['Loan_Status'].value_counts().plot.bar(title = 'Loan_Status')
plt.show()
#Visualizing Categorical independent features
#2X2 graph plot of all categorical features
plt.subplot(221)
train['Gender'].value_counts(normalize=True).plot.bar(title= 'Gender')
plt.subplot(222)
train['Married'].value_counts(normalize=True).plot.bar(title= 'Married')
plt.subplot(223)
train['Self_Employed'].value_counts(normalize=True).plot.bar(title= 'Self_Employed')
plt.subplot(224)
train['Credit_History'].value_counts(normalize=True).plot.bar(title= 'Credit_History')
#Displaying graph
plt.show()
#Visualizing Ordinal independent features
#ORDINAL FEATURES-> Categorical features that can be arranged in some order/hierarchy
#1X3 graph plot of all ordinal features
plt.subplot(131)
train['Dependents'].value_counts(normalize=True).plot.bar(title= 'Dependents')
plt.subplot(132)
train['Education'].value_counts(normalize=True).plot.bar(title= 'Education')
plt.subplot(133)
train['Property_Area'].value_counts(normalize=True).plot.bar(title= 'Property_Area')
#Displaying graph
plt.show()
#Visualizing Numerical independent features
#1X2 graph plot of Applicant Income
plt.figure(1)
plt.subplot(121)
sns.histplot(train['ApplicantIncome'], kde=True) #histplot replaces the deprecated distplot
plt.subplot(122)
train.boxplot(column = 'ApplicantIncome')
plt.show()
#The distribution of applicant income is right-skewed (most mass on the left, long right tail) - not normally distributed
#Boxplot confirms the presence of many outliers/extreme values - income disparity among social classes
#Boxplot of Applicant Income segregated based on "Education"
train.boxplot(column = 'ApplicantIncome' , by = 'Education')
plt.show()
#1X2 graph plot of Coapplicant Income
plt.figure(1)
plt.subplot(121)
sns.histplot(train['CoapplicantIncome'], kde=True) #histplot replaces the deprecated distplot
plt.subplot(122)
train.boxplot(column = 'CoapplicantIncome')
plt.show()
#The distribution of coapplicant income is also right-skewed - not normally distributed
#Boxplot confirms the presence of a lot of outliers/extreme values
#Since we don't know education status of coapplicant, we stop here
#1X2 graph plot of Loan Amount
plt.figure(1)
plt.subplot(121)
#Removing NaN values
df = train.dropna()
sns.histplot(df['LoanAmount'], kde=True) #histplot replaces the deprecated distplot
plt.subplot(122)
train.boxplot(column = 'LoanAmount')
plt.show()
#Distribution is fairly normal
#Lots of outliers here too
#OUTLIERS-> values above the upper inner fence (Q3 + 1.5*IQR) or below the lower inner fence (Q1 - 1.5*IQR)
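#Illustration (added for clarity, not part of the original analysis): computing Tukey's inner fences
#explicitly for ApplicantIncome to show how the outliers above are defined
q1, q3 = train['ApplicantIncome'].quantile([0.25, 0.75])
iqr = q3 - q1
lower_fence, upper_fence = q1 - 1.5 * iqr, q3 + 1.5 * iqr
outliers = (train['ApplicantIncome'] < lower_fence) | (train['ApplicantIncome'] > upper_fence)
print("Inner fences for ApplicantIncome: [{:.1f}, {:.1f}]".format(lower_fence, upper_fence))
print("Number of outlier incomes:", outliers.sum())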
##############################################################################################################################################
#BIVARIATE ANALYSIS-> Comparison between two variables; here, each independent variable against the target
print("\n\n\n\nBIVARIATE ANALYSIS")
#Categorical Independent Variable vs Target Variable
print("\n\nCategorical Independent Variable vs Target Variable")
#Split up of Loan_Status based on Gender
Gender_crosstab = pd.crosstab(train['Gender'],train['Loan_Status'])
print("\nSplit up of Loan_Status based on Gender(Frequency)")
print(Gender_crosstab)
#Finding probability split up of Loan_Status based on Gender
print("\nSplit up of Loan_Status based on Gender(Proportion)")
#Axis 1 => Y/N ; Axis 0 => Male/Female
#Getting probability of each loan status for each gender
Gender_prob = Gender_crosstab.div(Gender_crosstab.sum(axis = 1).astype(float), axis=0)
print(Gender_prob)
#Plotting Graph
Gender_prob.plot.bar(stacked = True , title='Gender vs Loan Status') #Reusing the proportions computed above
plt.show()
#Split up of Loan_Status based on Married or not
Married_crosstab = pd.crosstab(train['Married'],train['Loan_Status'])
print("\n\nSplit up of Loan_Status based on Married or not(Frequency)")
print(Married_crosstab)
#Finding probability split up of Loan_Status based on Married or not
print("\nSplit up of Loan_Status based on Married or not(Proportion)")
#Axis 1 => Y/N ; Axis 0 => Married/Unmarried
#Getting probability of each loan status for each marriage status
Married_prob = Married_crosstab.div(Married_crosstab.sum(axis = 1).astype(float), axis=0)
print(Married_prob)
#Plotting Graph
Married_prob.plot.bar(stacked = True , title='Married vs Loan Status')
plt.show()
#Split up of Loan_Status based on Education
Education_crosstab = pd.crosstab(train['Education'],train['Loan_Status'])
print("\n\nSplit up of Loan_Status based on Education(Frequency)")
print(Education_crosstab)
#Finding probability split up of Loan_Status based on Education
print("\nSplit up of Loan_Status based on Education(Proportion)")
#Axis 1 => Y/N ; Axis 0 => Graduated/Ungraduated
#Getting probability of each loan status for each education status
Education_prob = Education_crosstab.div(Education_crosstab.sum(axis = 1).astype(float), axis=0)
print(Education_prob)
#Plotting Graph
Education_prob.plot.bar(stacked = True , title='Education vs Loan Status')
plt.show()
#Split up of Loan_Status based on Selfemployed or not
Selfemployed_crosstab = pd.crosstab(train['Self_Employed'],train['Loan_Status'])
print("\n\nSplit up of Loan_Status based on Selfemployed or not(Frequency)")
print(Selfemployed_crosstab)
#Finding probability split up of Loan_Status based on Selfemployed or not
print("\nSplit up of Loan_Status based on Self employed or Not(Proportion)")
#Axis 1 => Y/N ; Axis 0 => Selfemployed/Not Selfemployed
#Getting probability of each loan status for each self employment status
Selfemployed_prob = Selfemployed_crosstab.div(Selfemployed_crosstab.sum(axis = 1).astype(float), axis=0)
print(Selfemployed_prob)
#Plotting Graph
Selfemployed_prob.plot.bar(stacked = True , title='Self Employed vs Loan Status')
plt.show()
#Split up of Loan_Status based on Number of dependents
Dependents_crosstab = pd.crosstab(train['Dependents'],train['Loan_Status'])
print("\n\nSplit up of Loan_Status based on number of Dependents(Frequency)")
print(Dependents_crosstab)
#Finding probability split up of Loan_Status based on number of Dependents
print("\nSplit up of Loan_Status based on number of Dependents(Proportion)")
#Axis 1 => Y/N ; Axis 0 => Number of Dependents
#Getting probability of each loan status for each number of dependents
Dependents_prob = Dependents_crosstab.div(Dependents_crosstab.sum(axis = 1).astype(float), axis=0)
print(Dependents_prob)
#Plotting Graph
Dependents_prob.plot.bar(stacked = True , title='Dependents vs Loan Status')
plt.show()
#Split up of Loan_Status based on Credit History
Credit_History_crosstab = pd.crosstab(train['Credit_History'],train['Loan_Status'])
print("\n\nSplit up of Loan_Status based on Credit History(Frequency)")
print(Credit_History_crosstab)
#Finding probability split up of Loan_Status based on Credit History
print("\nSplit up of Loan_Status based on Credit History(Proportion)")
#Axis 1 => Y/N ; Axis 0 => Credit History
#Getting probability of each loan status for each credit history status
Credit_History_prob = Credit_History_crosstab.div(Credit_History_crosstab.sum(axis = 1).astype(float), axis=0)
print(Credit_History_prob)
#Plotting Graph
Credit_History_prob.plot.bar(stacked = True , title='Credit History vs Loan Status')
plt.show()
#Split up of Loan_Status based on Property Area
Property_Area_crosstab = pd.crosstab(train['Property_Area'],train['Loan_Status'])
print("\n\nSplit up of Loan_Status based on Property Area(Frequency)")
print(Property_Area_crosstab)
#Finding probability split up of Loan_Status based on Property Area
print("\nSplit up of Loan_Status based on Property Area(Proportion)")
#Axis 1 => Y/N ; Axis 0 => Property_Area
#Getting probability of each loan status for each property area category
Property_Area_prob = Property_Area_crosstab.div(Property_Area_crosstab.sum(axis = 1).astype(float), axis=0)
print(Property_Area_prob)
#Plotting Graph
Property_Area_prob.plot.bar(stacked = True , title='Property Area vs Loan Status')
plt.show()
#Numerical Independent Variable vs Target Variable
print("\n\nNumerical Independent Variable vs Target Variable")
#Split up of Loan_Status based on Income
#Plotting mean income of accepted and rejected loans
train.groupby('Loan_Status')['ApplicantIncome'].mean().plot.bar(title = "Applicant's Income")
plt.show()
print("\nFrom Applicant Income vs Loan Status, we cannot come to a conclusion")
print("So, we create several bins and try comparing the loan status")
#Specifying limits for bins
bins=[0,2500,4000,6000,81000]
group=['Low','Average','High', 'Very high']
#Segments data into bins
print("Samples of Income after segregating into different bins")
train['Income_bin']=pd.cut(train['ApplicantIncome'],bins,labels=group)
print(train.loc[:10,['Income_bin','ApplicantIncome']])
#Split up of Loan_Status based on Income(Bins)
Income_bin_crosstab=pd.crosstab(train['Income_bin'],train['Loan_Status'])
print("\nSplit up of Loan_Status based on Income(Frequency)")
print(Income_bin_crosstab)
#Finding probability split up of Loan_Status based on Income
print("\nSplit up of Loan_Status based on Income(Proportion)")
#Axis 1 => Y/N ; Axis 0 => Income_bin
#Getting probability of each loan status for each Income bin category
Income_prob = Income_bin_crosstab.div(Income_bin_crosstab.sum(axis = 1).astype(float), axis=0)
print(Income_prob)
#Plotting graph
Income_prob.plot.bar(stacked = True , title='Income vs Loan Status')
plt.show()
print("It can be inferred that applicant income alone does not appear to affect the chances of loan approval")
#Split up of Loan_Status based on Co-Applicant Income
#Specifying limits for bins
bins=[0,1000,3000,42000]
group=['Low','Average','High']
#Segments data into bins
print("Samples of Co-Applicant Income after segregating into different bins")
train['Coapplicant_Income_bin']=pd.cut(train['CoapplicantIncome'],bins,labels=group)
print(train.loc[:10,['Coapplicant_Income_bin','CoapplicantIncome']])
#Split up of Loan_Status based on Co-Applicant Income(Bins)
Coapplicant_Income_bin_crosstab=pd.crosstab(train['Coapplicant_Income_bin'],train['Loan_Status'])
print("\nSplit up of Loan_Status based on Coapplicant Income")
print(Coapplicant_Income_bin_crosstab)
#Finding probability split up of Loan_Status based on Coapplicant Income
print("\nSplit up of Loan_Status based on Coapplicant Income - Probability")
#Axis 1 => Y/N ; Axis 0 => Coapplicant Income
#Getting probability of each loan status for each Co-applicant bin category
Coapplicant_Income_prob = Coapplicant_Income_bin_crosstab.div(Coapplicant_Income_bin_crosstab.sum(axis = 1).astype(float), axis=0)
print(Coapplicant_Income_prob)
#Plotting graph
Coapplicant_Income_prob.plot.bar(stacked = True , title='Coapplicant Income vs Loan Status')
plt.show()
print("It can be inferred that when the coapplicant's income is low, the chances of loan approval appear high")
print("""But this does not look right.
The possible reason behind this may be that most of the applicants don’t have any coapplicant.
So, the coapplicant income for such applicants is 0 and hence the loan approval is not dependent on it.
So, we will combine the applicant’s and coapplicant’s income to visualize the combined effect of income on loan approval
(NaN for some records indicates that the applicant has no coapplicant).""")
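#Quick check supporting the reasoning above (illustration, not part of the original script):
#most applicants have no coapplicant, so their coapplicant income is recorded as 0
print("Proportion of applicants with zero coapplicant income:", (train['CoapplicantIncome'] == 0).mean())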
#Split up of Loan_Status based on Total Income (Applicant and Co-Applicant)
#Combining Applicant and Co-Applicant's income
train['Total_Income']=train['ApplicantIncome']+train['CoapplicantIncome']
#Specifying limits for bins
bins=[0,2500,4000,6000,81000]
group=['Low','Average','High', 'Very high']
#Segments data into bins
print("Samples of Total Income (Applicant and Co-Applicant) after segregating into different bins")
train['Total_Income_bin']=pd.cut(train['Total_Income'],bins,labels=group)
print(train.loc[:10,['Total_Income_bin','Total_Income']])
#Split up of Loan_Status based on Total Income(Bins)
Total_Income_bin_crosstab=pd.crosstab(train['Total_Income_bin'],train['Loan_Status'])
print("\nSplit up of Loan_Status based on Total Income (Applicant and Co-Applicant)")
print(Total_Income_bin_crosstab)
#Finding probability split up of Loan_Status based on Total Income
print("\nSplit up of Loan_Status based on Total Income (Applicant and Co-Applicant) - Probability")
#Axis 1 => Y/N ; Axis 0 => Total Income
#Getting probability of each loan status for each total applicant bin category
Total_Income_prob = Total_Income_bin_crosstab.div(Total_Income_bin_crosstab.sum(axis = 1).astype(float), axis=0)
print(Total_Income_prob)
#Plotting graph
Total_Income_prob.plot.bar(stacked = True , title='Total Income vs Loan Status')
plt.show()
print("""It can be inferred that proportion of loans getting approved for applicants having low Total Income is the least.""")
#Split up of Loan_Status based on Loan Amount
#Specifying limits for bins
bins=[0,100,200,700]
group=['Low','Average','High']
#Segments data into bins
print("Samples of Loan amount after segregating into different bins")
train['LoanAmount_bin']=pd.cut(train['LoanAmount'],bins,labels=group)
print(train.loc[:10,['LoanAmount_bin','LoanAmount']])
#Split up of Loan_Status based on LoanAmount(Bins)
LoanAmount_bin_crosstab=pd.crosstab(train['LoanAmount_bin'],train['Loan_Status'])
print("\nSplit up of Loan_Status based on LoanAmount")
print(LoanAmount_bin_crosstab)
#Finding probability split up of Loan_Status based on LoanAmount
print("\nSplit up of Loan_Status based on LoanAmount - Probability")
#Axis 1 => Y/N ; Axis 0 => LoanAmount
#Getting probability of each loan status for each Loan Amount category
LoanAmount_prob = LoanAmount_bin_crosstab.div(LoanAmount_bin_crosstab.sum(axis = 1).astype(float), axis=0)
print(LoanAmount_prob)
#Plotting graph
LoanAmount_prob.plot.bar(stacked = True , title='Loan Amount vs Loan Status')
plt.show()
print("It can be seen that the proportion of approved loans is higher for Low and Average Loan Amount as compared to that of High Loan Amount")
#Heat map visualization of feature correlations
#Dropping the temporary bin columns created for the analysis above
train=train.drop(['Income_bin', 'Coapplicant_Income_bin', 'LoanAmount_bin', 'Total_Income_bin', 'Total_Income'], axis=1)
#Changing 3+ dependents to 3
train['Dependents'].replace('3+', 3,inplace=True)
#Changing categorical dependent variable into numerical
train['Loan_Status'].replace('N', 0,inplace=True)
train['Loan_Status'].replace('Y', 1,inplace=True)
#Correlation indicates the extent to which two or more variables fluctuate together.
#Correlation is computed only between numerical columns in the training dataset
matrix = train.corr(numeric_only=True) #numeric_only skips the remaining object columns (required in pandas 2.x)
#Plotting heatmap for the correlation matrix found
#Anything above 0.8 will be given max colour depth
sns.heatmap(matrix, vmax=.8, square=True, cmap="Blues")
plt.show()
print("""We see that the most correlated variables are
1. ApplicantIncome - LoanAmount
2. Credit_History - Loan_Status""")
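#Illustration (not part of the original script): extracting the strongest correlations programmatically.
#Each pair appears twice because the matrix is symmetric; self-correlations of 1.0 are filtered out.
corr_pairs = matrix.abs().unstack().sort_values(ascending=False)
print(corr_pairs[corr_pairs < 1.0].head(4))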
##############################################################################################################################################
#Imputing missing values
#Finding number of missing values in each feature
print("\nNumber of null values in each field")
print(train.isnull().sum())
#Imputing missing values for CATEGORICAL FEATURES with the mode
#(Loan_Amount_Term is numeric but takes only a few discrete values, so it is imputed with the mode as well)
train['Gender'].fillna(train['Gender'].mode()[0], inplace=True)
train['Married'].fillna(train['Married'].mode()[0], inplace=True)
train['Dependents'].fillna(train['Dependents'].mode()[0], inplace=True)
train['Self_Employed'].fillna(train['Self_Employed'].mode()[0], inplace=True)
train['Credit_History'].fillna(train['Credit_History'].mode()[0], inplace=True)
train['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].mode()[0], inplace=True)
#Imputing missing values for CONTINUOUS FEATURES with the median (robust to the outliers seen earlier)
train['LoanAmount'].fillna(train['LoanAmount'].median(), inplace=True)
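#Sanity check (illustration, not part of the original script): confirm no missing values remain after imputation
print("\nNull values remaining after imputation:")
print(train.isnull().sum())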
#Removing skewness by taking log transformation
#Log compresses larger values more, pulling in the long right tail
train['LoanAmount_log'] = np.log(train['LoanAmount'])
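#Illustration (not part of the original script): quantifying the effect of the log transform on skewness
print("Skewness of LoanAmount before log transform:", train['LoanAmount'].skew())
print("Skewness of LoanAmount after log transform :", train['LoanAmount_log'].skew())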
##############################################################################################################################################
#Building Logistic regression model
#Dropping ID
train=train.drop('Loan_ID', axis=1)
#Splitting train dataset into dependent and independent features
X = train.drop('Loan_Status', axis=1)
y = train.Loan_Status
#Performing One-hot encoding for CATEGORICAL features
X=pd.get_dummies(X)
print("\n\nTraining data after performing One-hot encoding")
train=pd.get_dummies(train)
print(type(train))
print(train[['Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban', 'Gender_Male', 'Gender_Female']])
print("\n\nDummy columns that are automatically generated for the purpose of One hot encoding")
print(train.columns)
#Splitting training data for training and validation
#70% data for training ; 30% data for validation
x_train, x_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y) #random_state for reproducibility; stratify keeps the class balance in both splits
#Building model
model = LogisticRegression(random_state=1, max_iter=1000) #max_iter raised so the lbfgs solver converges on unscaled features
#Fitting data
model.fit(x_train, y_train)
#Predicting validation data's output using model
pred_cv = model.predict(x_cv)
#Calculating accuracy of predictions made on validation data
print("\n\nAccuracy of the Logistic Regression model built is")
print(accuracy_score(y_cv,pred_cv))
##############################################################################################################################################
#Validating models using Stratified K-fold validation method
#Logistic Regression - Stratified k-folds cross Validation
print("\n\nLogistic Regression - Stratified k-folds cross Validation")
tot_acc = 0
#To make count of iterations
i=1
#Data is split into 5 folds
kf = StratifiedKFold(n_splits=5,random_state=1,shuffle=True)
#kf.split yields the train/validation row indices for each fold
for train_indices, test_indices in kf.split(X, y):
    print('\nIteration {} of kfold {}'.format(i, kf.n_splits))
    #Splitting training data for training and validation based on the fold indices
    xtr, xvl = X.iloc[train_indices], X.iloc[test_indices]
    ytr, yvl = y.iloc[train_indices], y.iloc[test_indices]
    #Building Logistic Regression model
    model = LogisticRegression(random_state=1, max_iter=1000) #max_iter raised so the lbfgs solver converges on unscaled features
    #Fitting data
    model.fit(xtr, ytr)
    #Predicting validation data's output using model
    pred_test = model.predict(xvl)
    #Calculating accuracy of predictions made on validation data
    score = accuracy_score(yvl, pred_test)
    print('accuracy_score', score)
    i += 1
    #Accumulating fold accuracies to compute the mean later
    tot_acc = tot_acc + score
LR_mean_acc = tot_acc/5
print("\nMean validation accuracy for Logistic Regression - Stratified k-folds cross Validation model is {}" .format(LR_mean_acc))
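#Compact alternative (illustration, not part of the original script): sklearn's cross_val_score
#runs the same stratified folds in a single call; its scores should match the loop above
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LogisticRegression(random_state=1, max_iter=1000), X, y, cv=kf, scoring='accuracy')
print("cross_val_score mean accuracy:", cv_scores.mean())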
#Decision Tree - Stratified k-folds cross Validation
print("\n\nDecision Tree - Stratified k-folds cross Validation")
tot_acc = 0
#To make count of iterations
i=1
#Data is split into 5 folds
kf = StratifiedKFold(n_splits=5,random_state=1,shuffle=True)
#kf.split yields the train/validation row indices for each fold
for train_indices, test_indices in kf.split(X, y):
    print('\nIteration {} of kfold {}'.format(i, kf.n_splits))
    #Splitting by position with iloc (kf yields positional indices)
    xtr, xvl = X.iloc[train_indices], X.iloc[test_indices]
    ytr, yvl = y.iloc[train_indices], y.iloc[test_indices]
    #Building Decision Tree model
    model = tree.DecisionTreeClassifier(random_state=1)
    #Fitting data
    model.fit(xtr, ytr)
    #Predicting validation data's output using model
    pred_test = model.predict(xvl)
    #Calculating accuracy of predictions made on validation data
    score = accuracy_score(yvl, pred_test)
    print('accuracy_score', score)
    i += 1
    #Accumulating fold accuracies to compute the mean later
    tot_acc = tot_acc + score
DT_mean_acc = tot_acc/5
print("\nMean validation accuracy for Decision Tree - Stratified k-folds cross Validation model is {}" .format(DT_mean_acc))
#Random Forest - Stratified k-folds cross Validation
print("\n\nRandom Forest - Stratified k-folds cross Validation")
tot_acc = 0
#To make count of iterations
i=1
#Data is split into 5 folds
kf = StratifiedKFold(n_splits=5,random_state=1,shuffle=True)
#kf.split yields the train/validation row indices for each fold
for train_indices, test_indices in kf.split(X, y):
    print('\nIteration {} of kfold {}'.format(i, kf.n_splits))
    #Splitting by position with iloc (kf yields positional indices)
    xtr, xvl = X.iloc[train_indices], X.iloc[test_indices]
    ytr, yvl = y.iloc[train_indices], y.iloc[test_indices]
    #Building Random Forest model
    model = RandomForestClassifier(random_state=1, max_depth=10)
    #Fitting data
    model.fit(xtr, ytr)
    #Predicting validation data's output using model
    pred_test = model.predict(xvl)
    #Calculating accuracy of predictions made on validation data
    score = accuracy_score(yvl, pred_test)
    print('accuracy_score', score)
    i += 1
    #Accumulating fold accuracies to compute the mean later
    tot_acc = tot_acc + score
RF_mean_acc = tot_acc/5
print("\nMean validation accuracy for Random Forest - Stratified k-folds cross Validation model is {}" .format(RF_mean_acc))
#XGBoost - Stratified k-folds cross Validation
print("\n\nXGBoost - Stratified k-folds cross Validation")
tot_acc = 0
#To make count of iterations
i=1
#Data is split into 5 folds
kf = StratifiedKFold(n_splits=5,random_state=1,shuffle=True)
#kf.split yields the train/validation row indices for each fold
for train_indices, test_indices in kf.split(X, y):
    print('\nIteration {} of kfold {}'.format(i, kf.n_splits))
    #Splitting by position with iloc (kf yields positional indices)
    xtr, xvl = X.iloc[train_indices], X.iloc[test_indices]
    ytr, yvl = y.iloc[train_indices], y.iloc[test_indices]
    #Building XGBoost model
    model = XGBClassifier(random_state=1, n_estimators=50)
    #Fitting data
    model.fit(xtr, ytr)
    #Predicting validation data's output using model
    pred_test = model.predict(xvl)
    #Calculating accuracy of predictions made on validation data
    score = accuracy_score(yvl, pred_test)
    print('accuracy_score', score)
    i += 1
    #Accumulating fold accuracies to compute the mean later
    tot_acc = tot_acc + score
XGB_mean_acc = tot_acc/5
print("\nMean validation accuracy for XGBoost - Stratified k-folds cross Validation model is {}" .format(XGB_mean_acc))
##############################################################################################################################################
#Comparison of accuracies of four algorithms
print("\n\nMean validation accuracies of four algorithms are listed below")
print("Logistic Regression : " +str(LR_mean_acc))
print("Decision Trees : " +str(DT_mean_acc))
print("Random Forest : " +str(RF_mean_acc))
print("XGBoost : " +str(XGB_mean_acc))
#Plotting graph for the comparison
Accuracies = {'Logistic Regression' : LR_mean_acc, 'Decision Trees' : DT_mean_acc, 'Random Forest' : RF_mean_acc, 'XGBoost' : XGB_mean_acc}
plt.bar(x = range(len(Accuracies)), height = list(Accuracies.values()), align = 'center')
plt.xticks(range(len(Accuracies)), list(Accuracies.keys()))
plt.yticks(np.arange(0, 1, 0.05))
plt.title('Accuracy Comparison')
plt.show()