#!/usr/bin/env python
# coding: utf-8

# # Problem Statement: Loan Approval Prediction Problem
# Type: Binary Classification
# Loan approval prediction is a classic problem for learning and applying many data analysis
# techniques in order to build the best classification model.
#
# The dataset contains the details of loan applicants and a status telling whether each
# loan application was approved or not.
# Based on that, a binary classification model is to be created with maximum accuracy.

# In[1]:


# Basic and most important libraries
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
from scipy import stats
from sklearn.utils import resample

# Classifiers
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance

# Model evaluation tools
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# Data processing functions
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# One encoder instance is shared by the label-encoding cell further down.
le = LabelEncoder()

# NOTE(review): this silences every warning, including convergence and
# deprecation warnings from pandas / scikit-learn.
warnings.filterwarnings("ignore")


# In[2]:


# Location of the training CSV; kept in one place so it is easy to repoint.
TRAIN_CSV_PATH = r"C:\Master\Learning\Analytics_Vidhya\Loan_Prediction-Hackathon\train.csv"

data = pd.read_csv(TRAIN_CSV_PATH)
# A bare expression is only displayed inside a notebook; print it so the
# script form shows the preview as well.
print(data.head(5))


# In[3]:


sns.pairplot(data)
plt.show()


# In[4]:


print(data.describe())


# In[5]:


data.info()


# In[6]:


fig = px.scatter_matrix(data["ApplicantIncome"])
fig.update_layout(width=700, height=400)
fig.show()


# It seems the data needs some preparation:
#
# -The Loan Amount column does not fit a Normal Distribution
#
# -There are outliers in the Applicant's Income and Co-applicant's Income
#

# In[7]:


print(data.isnull().sum())


# In[8]:


plt.figure(figsize=(10, 6))
sns.heatmap(data.isnull())
plt.show()


# # Normal Distribution
# In simple language: for normally distributed data, most of the data points lie near
# the mean of all data points.
#
# To validate the normal distribution of the data:
# Mean, Mode and Median are (approximately) equal.
#
# The distribution of the entire data can be characterised with the help of the
# Mean and the Standard Deviation.
#
# When the data is normally distributed, most of the data is concentrated near its mean value.
#
# To get an understanding of the distribution we can simply plot a distribution plot,
# i.e. a simple histogram.
#
# Normally distributed data produces a bell-shaped curve.
#
# Also Mean, Mode and Median of normally distributed data are equal (Mean=Mode=Median).
#
# One more check applies to standardized data: its mean should be 0 or near 0 and its
# standard deviation 1 or near 1.
#
# Mean = sum(All Data Points) / count(Data Points)
#
# Standard Deviation = Root of { sum [ Square(each data point - mean of whole data) ] / count(Data Points) }
#

# In[9]:


# Checking whether the non-categorical variables are normally distributed or not,
# i.e. checking for outliers.

def summarize_distribution(frame, columns):
    """Print mean, mode, median and standard deviation, then plot histograms.

    All four statistics skip missing values. The standard deviation is the
    population one (``ddof=0``), matching what ``np.std`` reports.

    Args:
        frame: DataFrame holding the columns to describe.
        columns: Names of the numeric columns to report, in display order.
    """
    statistics = (
        ("Mean", lambda series: series.mean()),
        ("Mode", lambda series: series.mode().iloc[0]),
        ("Median", lambda series: series.median()),
        ("Standard Deviation", lambda series: series.std(ddof=0)),
    )
    for title, compute in statistics:
        print(f"\n{title}:->\n")
        for column in columns:
            print(f"{column}: ", compute(frame[column]))

    for column in columns:
        # Only x is given so each bar is a row COUNT; passing y as well would
        # make plotly sum the values falling into each bin.
        fig = px.histogram(frame, x=column)
        fig.update_layout(title=column)
        fig.show()


print("Data distribution analysis:->---------------------------------------\n")
summarize_distribution(data, ["ApplicantIncome", "CoapplicantIncome", "LoanAmount"])


# From the graphs above, these variables are not normally distributed.
#
# A right-skewed distribution was found in all three variables.
#

# In[10]:


# One count chart per categorical column. The second tuple element is the
# column used for colouring (None keeps plotly's single default colour).
for column, color_by in (
    ("Gender", None),
    ("Married", None),
    ("Education", "Education"),
    ("Self_Employed", None),
    ("Dependents", None),
    ("Property_Area", None),
    ("Loan_Status", "Loan_Status"),
):
    fig = px.bar(data, x=data[column], color=color_by)
    fig.show()


# Prepare data for model training, i.e. removing outliers, filling null values, removing skewness

# In[11]:


for column in (
    "Gender",
    "Married",
    "Self_Employed",
    "Dependents",
    "Credit_History",
    "Loan_Amount_Term",
):
    print(data[column].value_counts())


# ->Taking the mode of the values in a column is the best way to fill null values here.
# ->Not the mean, because the values are categorical rather than continuous.

# In[12]:


# Filling all NaN values with the mode of the respective variable.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection mutates a temporary object under pandas Copy-on-Write and
# leaves `data` unchanged.
for column in (
    "Gender",
    "Married",
    "Self_Employed",
    "Loan_Amount_Term",
    "Dependents",
    "Credit_History",
):
    data[column] = data[column].fillna(data[column].mode()[0])

# All values of the "Dependents" column are in "str" form; they are converted to "int" form next.
# Map the string categories of "Dependents" to integers in one pass and make
# the integer dtype explicit instead of relying on implicit downcasting.
DEPENDENTS_TO_INT = {"3+": 3, "1": 1, "2": 2, "0": 0}
data["Dependents"] = data["Dependents"].replace(DEPENDENTS_TO_INT).astype(int)

# Assign the filled column back; fillna(inplace=True) on a column selection
# does not update the frame under pandas Copy-on-Write.
data["LoanAmount"] = data["LoanAmount"].fillna(data["LoanAmount"].median())

print(data.isnull().sum())

# Heat map for null values
plt.figure(figsize=(10, 6))
sns.heatmap(data.isnull())
plt.show()


# In[13]:


def summarize_distribution(frame, columns):
    """Print mean, mode, median and standard deviation, then plot histograms.

    All four statistics skip missing values. The standard deviation is the
    population one (``ddof=0``), matching what ``np.std`` reports.

    Args:
        frame: DataFrame holding the columns to describe.
        columns: Names of the numeric columns to report, in display order.
    """
    statistics = (
        ("Mean", lambda series: series.mean()),
        ("Mode", lambda series: series.mode().iloc[0]),
        ("Median", lambda series: series.median()),
        ("Standard Deviation", lambda series: series.std(ddof=0)),
    )
    for title, compute in statistics:
        print(f"\n{title}:->\n")
        for column in columns:
            print(f"{column}: ", compute(frame[column]))

    for column in columns:
        # Only x is given so each bar is a row COUNT; passing y as well would
        # make plotly sum the values falling into each bin.
        fig = px.histogram(frame, x=column)
        fig.update_layout(title=column)
        fig.show()


SKEWED_COLUMNS = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount"]

# Treating outliers and converting the data towards a Normal Distribution
# Before removing outliers
summarize_distribution(data, SKEWED_COLUMNS)

####################################################################################################
# Getting log values :->

data["ApplicantIncome"] = np.log(data["ApplicantIncome"])
# "CoapplicantIncome" contains zeros, which must stay 0 after the transform.
# Masking them to 1 first gives exactly that, because log(1) == 0.
coapplicant_income = data["CoapplicantIncome"]
data["CoapplicantIncome"] = np.log(coapplicant_income.mask(coapplicant_income == 0, 1))
data["LoanAmount"] = np.log(data["LoanAmount"])
####################################################################################################

print("---------------------------After converting to Normal Distributed data----------------------")
summarize_distribution(data, SKEWED_COLUMNS)


# Now we can see a bell curve for all three variables; the data is much closer to normally distributed.

# In[14]:


data.head(5)


# In[15]:


# Integer-encode each categorical column; the shared encoder is refitted on
# every column in turn, so each column gets its own 0..k-1 coding.
for categorical_column in (
    "Gender",
    "Married",
    "Education",
    "Self_Employed",
    "Property_Area",
    "Loan_Status",
):
    data[categorical_column] = le.fit_transform(data[categorical_column])

#data = pd.get_dummies(data)
data.head(5)


# In[ ]:





# In[ ]:





# # Feature Importance
#
# In order to create the best predictive model we need to understand the available data well and
# extract as much information from it as possible.
#
# In multivariate data it is important to understand the importance of the variables and
# how much they contribute towards the target variable, so that unnecessary variables can be
# removed to increase model performance.
#
# Datasets often contain extra columns which do not carry information useful for classifying the data.
# This leads the model to make wrong assumptions while training.
#
# To understand the importance of the variables we are going to use Machine Learning classifiers and
# then plot a bar graph based on importance.
#
# XGBoost also has a built-in feature-importance plotting tool which we are going to use.
#
# Using more than one classifier increases the confidence in our decision about which variables to keep
# and which to remove.

# In[16]:


# Dividing data into input X variables and target y variable
X = data.drop(["Loan_Status", "Loan_ID"], axis=1)
y = data["Loan_Status"]


# In[17]:


def show_feature_scores(title, scores, feature_names):
    """Print one score per feature and draw the scores as a titled bar chart.

    Args:
        title: Chart title identifying the model the scores came from.
        scores: One importance / coefficient value per feature.
        feature_names: Feature names, in the same order as ``scores``.
    """
    for feature_name, score in zip(feature_names, scores):
        print(feature_name, "->", score)
    plt.figure(figsize=(16, 5))
    plt.title(label=title)
    # Label the bars with the feature names rather than positional indices.
    plt.bar(feature_names, scores)
    plt.xticks(rotation=45, ha="right")
    plt.show()


Columns = list(X.columns)

print("Feature importance by XGBoost:->\n")
XGBR = XGBClassifier()
XGBR.fit(X, y)
show_feature_scores("XGBC", XGBR.feature_importances_, Columns)

plot_importance(XGBR)
plt.show()

print("Feature importance by Random Forest:->\n")
RF = RandomForestClassifier()
RF.fit(X, y)
show_feature_scores("RF", RF.feature_importances_, Columns)

print("Feature importance by Decision Tree:->\n")
DT = DecisionTreeClassifier()
DT.fit(X, y)
show_feature_scores("DT", DT.feature_importances_, Columns)

# Coefficients of linear models are only comparable between features when the
# features share a scale, so the linear models are fitted on standardized inputs.
X_standardized = StandardScaler().fit_transform(X)

print("Feature importance by Suppoprt Vector Machine:->\n")
SVM = SVC(kernel="linear")
SVM.fit(X_standardized, y)
show_feature_scores("SVM", SVM.coef_[0], Columns)

print("Feature importance by Logistic Regression:->\n")
LOGC = LogisticRegression()
LOGC.fit(X_standardized, y)
show_feature_scores("LOGC", LOGC.coef_[0], Columns)


# From feature importance => Credit History, ApplicantIncome, CoapplicantIncome, LoanAmount are the most important features

# # Is the data balanced?

# In[18]:


# Heat map of the dataset with relative importance.
# Only numeric columns can be correlated; the string "Loan_ID" column that is
# still in the frame makes DataFrame.corr() raise on pandas >= 2.0.
matrix = (
    data.drop(["Gender", "Married", "Dependents", "Education", "Self_Employed"], axis=1)
    .select_dtypes(include="number")
    .corr()
)
plt.figure(figsize=(18, 8))
sns.heatmap(matrix, vmax=0.8, square=True, cmap="BuPu")
plt.show()


# It seems Applicant Income and Loan Amount are correlated, Co-applicant Income is also correlated
# with Loan Amount, and Credit History is correlated with Loan Status.

# In[19]:


status_counts = data["Loan_Status"].value_counts()
A = int(status_counts.get(1, 0))
B = int(status_counts.get(0, 0))
print("Count of 1: ", A, "\nCount of 0: ", B)

fig = px.bar(x=["Approved", "Rejected"], y=[A, B], color=[A, B])
fig.show()


# It seems that the data is highly imbalanced.
#
# When the target classes do not have equal counts, the data is considered imbalanced.
#
# From the graph above, the dataset contains more records with Approved Loan_Status than
# Rejected Loan_Status: 422 versus 192.
#
# If the difference were at most 20-30 records, the imbalance could be ignored.
#
# Imbalance leads the model to make wrong assumptions and the model will be biased after training.
# We will overcome this issue by balancing the data.
#
# To overcome this problem we balance the data using resampling: upsampling and downsampling.

# In[20]:


# Keep the original data as it is so it can be used later.
new_data = data.copy()

# Separate the rows by status and let the row counts decide which class is the
# majority, instead of assuming it is class 1.
approved_rows = new_data[new_data.Loan_Status == 1]
rejected_rows = new_data[new_data.Loan_Status == 0]
if len(approved_rows) >= len(rejected_rows):
    df_majority, df_minority = approved_rows, rejected_rows
else:
    df_majority, df_minority = rejected_rows, approved_rows

# Downsampling the majority class: draw (without replacement) as many majority
# rows as there are minority rows. The size is derived from the data rather
# than hard-coded.
df_manjority_downsampled = resample(
    df_majority, replace=False, n_samples=len(df_minority), random_state=123
)
df_downsampled = pd.concat([df_manjority_downsampled, df_minority])
print("Downsampled data:->\n", df_downsampled.Loan_Status.value_counts())

# Upsampling the minority class: draw (with replacement) as many minority rows
# as there are majority rows.
df_monority_upsampled = resample(
    df_minority, replace=True, n_samples=len(df_majority), random_state=123
)
df_upsampled = pd.concat([df_majority, df_monority_upsampled])
print("Upsampled data:->\n", df_upsampled.Loan_Status.value_counts())


# In[ ]:





# # Data Standardization / Normalization
#
# Data normalization is required when the variable values lie in very different ranges.
#
# For example, suppose we have 2 columns "Age" and "Income",
#
# where the values of "Age" lie in roughly 0-100
# and the values of "Income" lie in 20,000 to 100,000.
#
# In that case many models perform poorly on testing data because the input values are not
# in the same value range.
#
# So, not every time, but whenever we get such data we need to normalize it, i.e. rescale it.
#
# Widely used scaling tools are the Min-Max Scaler and the Standard Scaler.
#
# Data normalization is done by the Min-Max Scaler, which scales all the values into the 0 to 1 range.
#
# Data standardization is done by the Standard Scaler, which scales the data so that the mean of the
# observed data is 0 and the standard deviation is 1.
#
# We choose standardization with the Standard Scaler here. Note that standardization only
# re-centres and rescales each feature; it does not change skewness (the log transform
# applied earlier is what reduces skew). The aim is an accuracy gain for scale-sensitive models.

# In[ ]:





# # Experimental Modeling
#
# In order to gain the maximum possible accuracy one needs to conduct many more experiments.
#
# We will pass the data one by one in different states, i.e.
+# +# -Only Scaled data +# +# -Scaled + Down Sampled Data +# +# -Scaled + Up Sampled Data +# +# -Scaled + Up Sampled Data + Selected feature with respective importance. + +# In[21]: + + +#Experiment 1: Only Scaled data with all variables + +#X = new_data.drop(["Loan_ID","Gender","Married","Education","Self_Employed","Loan_Amount_Term","Loan_Status",'Property_Area'],axis=1) +X = new_data.drop(["Loan_Status","Loan_ID"],axis=1) +y = new_data["Loan_Status"] +counter = Counter(y) +print("Counter: ",counter) + +X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.25,random_state=0) + +#Scaling data here:-------------> + +StSc = StandardScaler() +X_train = StSc.fit_transform(X_train) +X_test = StSc.fit_transform(X_test) + +#Check mean is 0 and Standard deviation is 1 +print("After Standardization\nMean ",np.mean(X_train),"Standard Deviation ",np.std(X_train),"\n") + +#Voting ensemble mathod. Combining all tree based algorithms. +models = [] +models.append(("XGB",XGBClassifier())) +models.append(("RF",RandomForestClassifier())) +models.append(("DT",DecisionTreeClassifier())) +models.append(("ADB",AdaBoostClassifier())) +models.append(("GB",GradientBoostingClassifier())) + +ensemble = VotingClassifier(estimators=models) +ensemble.fit(X_train,y_train) +y_pred = ensemble.predict(X_test) +print(classification_report(y_pred,y_test)) +print("Voting Ensemble:>",accuracy_score(y_pred,y_test)) + + + +SVM = SVC(kernel="linear",class_weight="balanced",probability=True) +SVM.fit(X_train,y_train) +y_pred = SVM.predict(X_test) +print(classification_report(y_pred,y_test)) +print("SVM:>",accuracy_score(y_pred,y_test)) + + +XGBC = XGBClassifier(learning_rate =0.1,n_estimators=10000,max_depth=4,min_child_weight=6,gamma=0,subsample=0.6,colsample_bytree=0.8, + reg_alpha=0.005, objective= 'binary:logistic', nthread=2, scale_pos_weight=1, seed=27) +XGBC.fit(X_train,y_train) +y_pred = XGBC.predict(X_test) +print(classification_report(y_pred,y_test)) 
+print("XGBoost:>",accuracy_score(y_pred,y_test)) + +Model1 = RandomForestClassifier(n_estimators=1000,random_state=0,n_jobs=1000,max_depth=70,bootstrap=True) +Model1.fit(X_train,y_train) +y_pred = Model1.predict(X_test) +print(classification_report(y_pred,y_test)) +print("RandomForestClassifier:>",accuracy_score(y_pred,y_test)) + + +Model2 = GradientBoostingClassifier() +Model2.fit(X_train,y_train) +y_pred = Model2.predict(X_test) +print(classification_report(y_pred,y_test)) +print("GradientBoostingClassifier:>",accuracy_score(y_pred,y_test)) + + +Model3 = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100, + max_features=1.0, max_leaf_nodes=10, + min_impurity_split=1e-07, min_samples_leaf=1, + min_samples_split=2, min_weight_fraction_leaf=0.10, + presort=False, random_state=27, splitter='best') +Model3.fit(X_train,y_train) +y_pred = Model3.predict(X_test) +print(classification_report(y_pred,y_test)) +print("DecisionTreeClassifier:>",accuracy_score(y_pred,y_test)) + + +Model4 = AdaBoostClassifier() +Model4.fit(X_train,y_train) +y_pred = Model4.predict(X_test) +print(classification_report(y_pred,y_test)) +print("AdaBoostClassifier:>",accuracy_score(y_pred,y_test)) + + +Model5 = LinearDiscriminantAnalysis() +Model5.fit(X_train,y_train) +y_pred = Model5.predict(X_test) +print(classification_report(y_pred,y_test)) +print("LinearDiscriminantAnalysis:>",accuracy_score(y_pred,y_test),"\n") + + +KNN = KNeighborsClassifier(leaf_size=1,p=2,n_neighbors=20) +KNN.fit(X_train,y_train) +y_pred = KNN.predict(X_test) +print(classification_report(y_pred,y_test)) +print("KNeighborsClassifier:>",accuracy_score(y_pred,y_test)) + + +Model7 = GaussianNB() +Model7.fit(X_train,y_train) +y_pred = Model7.predict(X_test) +print(classification_report(y_pred,y_test)) +print("GaussianNB:>",accuracy_score(y_pred,y_test)) + + +Model8 = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, + intercept_scaling=1, max_iter=100, multi_class='ovr', 
n_jobs=1, + penalty='l2', random_state=None, solver='liblinear', tol=0.0001, + verbose=0, warm_start=False) +Model8.fit(X_train,y_train) +y_pred = Model8.predict(X_test) +print(classification_report(y_pred,y_test)) +print("Logistic Regression:>",accuracy_score(y_pred,y_test)) + + +# In[22]: + + +#Experiment 2: Sclaed + Down Sampled Data + +#X = df_downsampled.drop(["Loan_ID","Gender","Married","Education","Self_Employed","Loan_Amount_Term","Loan_Status",'Property_Area'],axis=1) +X = df_downsampled.drop(["Loan_Status","Loan_ID"],axis=1) +y = df_downsampled.Loan_Status +X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.25,random_state=0) + + +#Scaling data here:-------------> + +StSc = StandardScaler() +X_train = StSc.fit_transform(X_train) +X_test = StSc.fit_transform(X_test) + +#Check mean is 0 and Standard deviation is 1 +print("After Standardization\nMean ",np.mean(X_train),"Standard Deviation ",np.std(X_train),"\n") + +#Voting ensemble mathod. Combining all tree based algorithms. 
+models = [] +models.append(("XGB",XGBClassifier())) +models.append(("RF",RandomForestClassifier())) +models.append(("DT",DecisionTreeClassifier())) +models.append(("ADB",AdaBoostClassifier())) +models.append(("GB",GradientBoostingClassifier())) + +ensemble = VotingClassifier(estimators=models) +ensemble.fit(X_train,y_train) +y_pred = ensemble.predict(X_test) +print(classification_report(y_pred,y_test)) +print("Voting Ensemble:>",accuracy_score(y_pred,y_test)) + + + +SVM = SVC(kernel="linear",class_weight="balanced",probability=True) +SVM.fit(X_train,y_train) +y_pred = SVM.predict(X_test) +print(classification_report(y_pred,y_test)) +print("SVM:>",accuracy_score(y_pred,y_test)) + + +XGBC = XGBClassifier(learning_rate =0.1,n_estimators=10000,max_depth=4,min_child_weight=6,gamma=0,subsample=0.6,colsample_bytree=0.8, + reg_alpha=0.005, objective= 'binary:logistic', nthread=2, scale_pos_weight=1, seed=27) +XGBC.fit(X_train,y_train) +y_pred = XGBC.predict(X_test) +print(classification_report(y_pred,y_test)) +print("XGBoost:>",accuracy_score(y_pred,y_test)) + +Model1 = RandomForestClassifier(n_estimators=1000,random_state=0,n_jobs=1000,max_depth=70,bootstrap=True) +Model1.fit(X_train,y_train) +y_pred = Model1.predict(X_test) +print(classification_report(y_pred,y_test)) +print("RandomForestClassifier:>",accuracy_score(y_pred,y_test)) + + +Model2 = GradientBoostingClassifier() +Model2.fit(X_train,y_train) +y_pred = Model2.predict(X_test) +print(classification_report(y_pred,y_test)) +print("GradientBoostingClassifier:>",accuracy_score(y_pred,y_test)) + + +Model3 = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100, + max_features=1.0, max_leaf_nodes=10, + min_impurity_split=1e-07, min_samples_leaf=1, + min_samples_split=2, min_weight_fraction_leaf=0.10, + presort=False, random_state=27, splitter='best') +Model3.fit(X_train,y_train) +y_pred = Model3.predict(X_test) +print(classification_report(y_pred,y_test)) 
+print("DecisionTreeClassifier:>",accuracy_score(y_pred,y_test)) + + +Model4 = AdaBoostClassifier() +Model4.fit(X_train,y_train) +y_pred = Model4.predict(X_test) +print(classification_report(y_pred,y_test)) +print("AdaBoostClassifier:>",accuracy_score(y_pred,y_test)) + + +Model5 = LinearDiscriminantAnalysis() +Model5.fit(X_train,y_train) +y_pred = Model5.predict(X_test) +print(classification_report(y_pred,y_test)) +print("LinearDiscriminantAnalysis:>",accuracy_score(y_pred,y_test)) + +KNN = KNeighborsClassifier(leaf_size=1,p=2,n_neighbors=20) +KNN.fit(X_train,y_train) +y_pred = KNN.predict(X_test) +print(classification_report(y_pred,y_test)) +print("KNeighborsClassifier:>",accuracy_score(y_pred,y_test)) + + +Model7 = GaussianNB() +Model7.fit(X_train,y_train) +y_pred = Model7.predict(X_test) +print(classification_report(y_pred,y_test)) +print("GaussianNB:>",accuracy_score(y_pred,y_test)) + + +Model8 = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, + intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, + penalty='l2', random_state=None, solver='liblinear', tol=0.0001, + verbose=0, warm_start=False) +Model8.fit(X_train,y_train) +y_pred = Model8.predict(X_test) +print(classification_report(y_pred,y_test)) +print("Logistic Regression:>",accuracy_score(y_pred,y_test)) + + +# In[23]: + + +#Experiment 3: Sclaed + Up Sampled Data + +#X = df_upsampled.drop(["Loan_ID","Gender","Married","Education","Self_Employed","Loan_Amount_Term","Loan_Status",'Property_Area'],axis=1) +X = df_upsampled.drop(["Loan_Status","Loan_ID"],axis=1) +y = df_upsampled.Loan_Status +print(len(X),len(y)) +X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.25,random_state=0) + +#Scaling data here:-------------> + +StSc = StandardScaler() +X_train = StSc.fit_transform(X_train) +X_test = StSc.fit_transform(X_test) + +#Check mean is 0 and Standard deviation is 1 +print("After Standardization\nMean ",np.mean(X_train),"Standard Deviation 
",np.std(X_train),"\n") + +#Voting ensemble mathod. Combining all tree based algorithms. +models = [] +models.append(("XGB",XGBClassifier())) +models.append(("RF",RandomForestClassifier())) +models.append(("DT",DecisionTreeClassifier())) +models.append(("ADB",AdaBoostClassifier())) +models.append(("GB",GradientBoostingClassifier())) + +ensemble = VotingClassifier(estimators=models) +ensemble.fit(X_train,y_train) +y_pred = ensemble.predict(X_test) +print(classification_report(y_pred,y_test)) +print("Voting Ensemble:>",accuracy_score(y_pred,y_test)) + + + +SVM = SVC(kernel="linear",class_weight="balanced",probability=True) +SVM.fit(X_train,y_train) +y_pred = SVM.predict(X_test) +print(classification_report(y_pred,y_test)) +print("SVM:>",accuracy_score(y_pred,y_test)) + + +XGBC = XGBClassifier(learning_rate =0.1,n_estimators=10000,max_depth=4,min_child_weight=6,gamma=0,subsample=0.6,colsample_bytree=0.8, + reg_alpha=0.005, objective= 'binary:logistic', nthread=2, scale_pos_weight=1, seed=27) +XGBC.fit(X_train,y_train) +y_pred = XGBC.predict(X_test) +print(classification_report(y_pred,y_test)) +print("XGBoost:>",accuracy_score(y_pred,y_test)) + +Model1 = RandomForestClassifier(n_estimators=1000,random_state=0,n_jobs=1000,max_depth=70,bootstrap=True) +Model1.fit(X_train,y_train) +y_pred = Model1.predict(X_test) +print(classification_report(y_pred,y_test)) +print("RandomForestClassifier:>",accuracy_score(y_pred,y_test)) + + +Model2 = GradientBoostingClassifier() +Model2.fit(X_train,y_train) +y_pred = Model2.predict(X_test) +print(classification_report(y_pred,y_test)) +print("GradientBoostingClassifier:>",accuracy_score(y_pred,y_test)) + + +Model3 = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100, + max_features=1.0, max_leaf_nodes=10, + min_impurity_split=1e-07, min_samples_leaf=1, + min_samples_split=2, min_weight_fraction_leaf=0.10, + presort=False, random_state=27, splitter='best') +Model3.fit(X_train,y_train) +y_pred = Model3.predict(X_test) 
+print(classification_report(y_pred,y_test)) +print("DecisionTreeClassifier:>",accuracy_score(y_pred,y_test)) + + +Model4 = AdaBoostClassifier() +Model4.fit(X_train,y_train) +y_pred = Model4.predict(X_test) +print(classification_report(y_pred,y_test)) +print("AdaBoostClassifier:>",accuracy_score(y_pred,y_test)) + + +Model5 = LinearDiscriminantAnalysis() +Model5.fit(X_train,y_train) +y_pred = Model5.predict(X_test) +print(classification_report(y_pred,y_test)) +print("LinearDiscriminantAnalysis:>",accuracy_score(y_pred,y_test)) + +KNN = KNeighborsClassifier(leaf_size=1,p=2,n_neighbors=20) +KNN.fit(X_train,y_train) +y_pred = KNN.predict(X_test) +print(classification_report(y_pred,y_test)) +print("KNeighborsClassifier:>",accuracy_score(y_pred,y_test)) + + +Model7 = GaussianNB() +Model7.fit(X_train,y_train) +y_pred = Model7.predict(X_test) +print(classification_report(y_pred,y_test)) +print("GaussianNB:>",accuracy_score(y_pred,y_test)) + + +Model8 = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, + intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, + penalty='l2', random_state=None, solver='liblinear', tol=0.0001, + verbose=0, warm_start=False) +Model8.fit(X_train,y_train) +y_pred = Model8.predict(X_test) +print(classification_report(y_pred,y_test)) +print("Logistic Regression:>",accuracy_score(y_pred,y_test)) + + +# In[24]: + + +# Experiment 4: Sclaed + Selected features with respective importance +#Droping features which are less important and keeping features as per importance analysis. 
# Keep only the features retained by the importance analysis, standardize
# them, then fit and score every candidate classifier on one hold-out split.
X = new_data.drop(
    ["Loan_ID", "Gender", "Married", "Education", "Self_Employed",
     "Loan_Amount_Term", "Loan_Status", "Property_Area"],
    axis=1,
)
# Alternative (all features):
#X = new_data.drop(["Loan_Status","Loan_ID"],axis=1)
y = new_data.Loan_Status
print(len(X), len(y))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

#Scaling data here:------------->

StSc = StandardScaler()
X_train = StSc.fit_transform(X_train)
# The test split must be scaled with the mean/std learned from the training
# split; re-fitting the scaler on the test data puts the two splits on
# different scales and leaks test statistics into the evaluation.
X_test = StSc.transform(X_test)

#Check mean is 0 and Standard deviation is 1
print("After Standardization\nMean ", np.mean(X_train), "Standard Deviation ", np.std(X_train), "\n")


def _fit_and_report(label, model):
    """Fit `model` on the training split and print its hold-out metrics.

    Prints the classification report followed by `label` and the accuracy,
    and returns the predictions for the test split.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred); swapping them interchanges
    # the precision and recall columns of the report.
    print(classification_report(y_test, predictions))
    print(label, accuracy_score(y_test, predictions))
    return predictions


#Voting ensemble method. Combining all tree based algorithms.
models = [
    ("XGB", XGBClassifier()),
    ("RF", RandomForestClassifier()),
    ("DT", DecisionTreeClassifier()),
    ("ADB", AdaBoostClassifier()),
    ("GB", GradientBoostingClassifier()),
]

ensemble = VotingClassifier(estimators=models)
y_pred = _fit_and_report("Voting Ensemble:>", ensemble)


SVM = SVC(kernel="linear", class_weight="balanced", probability=True)
y_pred = _fit_and_report("SVM:>", SVM)


XGBC = XGBClassifier(learning_rate=0.1, n_estimators=10000, max_depth=4, min_child_weight=6,
                     gamma=0, subsample=0.6, colsample_bytree=0.8, reg_alpha=0.005,
                     objective='binary:logistic', nthread=2, scale_pos_weight=1, seed=27)
y_pred = _fit_and_report("XGBoost:>", XGBC)

# n_jobs=-1 uses every available core; the original value of 1000 asked for
# far more workers than any machine has.
Model1 = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1, max_depth=70, bootstrap=True)
y_pred = _fit_and_report("RandomForestClassifier:>", Model1)


Model2 = GradientBoostingClassifier()
y_pred = _fit_and_report("GradientBoostingClassifier:>", Model2)


# `min_impurity_split` and `presort` were removed from scikit-learn, so
# passing them raises TypeError on current releases; they are omitted here.
Model3 = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100,
                                max_features=1.0, max_leaf_nodes=10,
                                min_samples_leaf=1, min_samples_split=2,
                                min_weight_fraction_leaf=0.10,
                                random_state=27, splitter='best')
y_pred = _fit_and_report("DecisionTreeClassifier:>", Model3)


Model4 = AdaBoostClassifier()
y_pred = _fit_and_report("AdaBoostClassifier:>", Model4)


Model5 = LinearDiscriminantAnalysis()
y_pred = _fit_and_report("LinearDiscriminantAnalysis:>", Model5)

KNN = KNeighborsClassifier(leaf_size=1, p=2, n_neighbors=20)
y_pred = _fit_and_report("KNeighborsClassifier:>", KNN)


Model7 = GaussianNB()
y_pred = _fit_and_report("GaussianNB:>", Model7)


# Only the solver differs from the library defaults. The explicit
# `multi_class='ovr'` argument is deprecated in recent scikit-learn releases
# and is not needed for a binary target, so it is no longer passed.
Model8 = LogisticRegression(solver="liblinear")
y_pred = _fit_and_report("Logistic Regression:>", Model8)

# In[25]:


#Hyperparameters tuning for KNN
# Exhaustive grid search (10-fold CV on the training split) over leaf_size,
# n_neighbors and the Minkowski power p, then a hold-out evaluation of a KNN
# built with the winning combination.
# NOTE(review): features are not standardized in this cell; KNN is
# distance-based, so confirm whether scaling was intentionally skipped.

#X = new_data.drop(["Loan_ID","Gender","Married","Education","Self_Employed","Loan_Amount_Term","Loan_Status","Property_Area"],axis=1)
X = new_data.drop(["Loan_Status", "Loan_ID"], axis=1)
y = new_data.Loan_Status
print(len(X), len(y))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)


leaf_size = list(range(1, 50))
n_neighbors = list(range(1, 30))
p = [1, 2]
#Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
#Create new KNN object
knn_2 = KNeighborsClassifier()
#Use GridSearch
clf = GridSearchCV(knn_2, hyperparameters, cv=10)
#Fit the model
best_model = clf.fit(X_train, y_train)

# best_params_ holds exactly the searched keys, so it replaces the repeated
# best_estimator_.get_params() look-ups.
best_params = best_model.best_params_
LS = best_params['leaf_size']
P = best_params['p']
Num = best_params['n_neighbors']

#Print The value of best Hyperparameters
print('Best leaf_size:', LS)
print('Best p:', P)
print('Best n_neighbors:', Num)

KNN = KNeighborsClassifier(leaf_size=LS, p=P, n_neighbors=Num)
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); swapping them interchanges the
# precision and recall columns of the report.
print(classification_report(y_test, y_pred))
print("KNeighborsClassifier:>", accuracy_score(y_test, y_pred))


# In[ ]:





# In[26]:


# Tuning SVM parameters
# Grid search over kernel and C with repeated stratified 10-fold CV.

#X = new_data.drop(["Loan_ID","Gender","Married","Education","Self_Employed","Loan_Amount_Term","Loan_Status","Property_Area"],axis=1)
X = new_data.drop(["Loan_Status", "Loan_ID"], axis=1)
y = new_data.Loan_Status
print(len(X), len(y))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)


model = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
# define grid search
grid = dict(kernel=kernel, C=C, gamma=gamma)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
# Search on the training split only, so the hold-out rows created above stay
# unseen during model selection (the original fitted on the full X, y and
# never used the split).
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


# # Conclusion
# Result summary is as below:----->
# Algorithm : Accuracy
#
# Experiment 1: Scaled data only
#
# Support Vector Machine          83.116
# Decision Tree                   83.1168
# Linear Discriminant Analysis    83.166
# KNearest Neighbors              83.766
# Gaussian Naive Bayes            83.116
# Logistic Regression             83.116
#
# Experiment 2: Scaled + Down Sampled Data
#
# AdaBoost                        73.95
# Decision Tree                   72.91
# Voting Ensemble                 71.87
#
#
# Experiment 3: Scaled + Up Sampled Data
#
# Random Forest only              83.88
#
# Experiment 4: Scaled + Selected features with respective importance
#
# Support Vector Machine          83.11
# Decision Tree                   83.11
# AdaBoost                        82.46
# Linear Discriminant Analysis    83.11
# KNearest Neighbors              83.11
# Gaussian Naive Bayes            83.11
# Logistic Regression             83.11
#
# Also after parameter tuning with
#
# KNN                             83.11
#
# After all the experiments, the maximum accuracy was achieved by balancing the data with up sampling. Surprisingly, only
# Random Forest performed well in that state of the data.
#
# Surprisingly, feature selection did not increase the accuracy.
#
#

# In[ ]:


