diff --git a/Loan_Approval_Prediction.html b/Loan_Approval_Prediction.html new file mode 100644 index 0000000..7410f02 --- /dev/null +++ b/Loan_Approval_Prediction.html @@ -0,0 +1,17338 @@ + + +
+ +Type: Binary Classification +Loan approval prediction is a classic problem for learning and applying a wide range of data analysis techniques to +build the best possible classification model.
+We are given a dataset containing the details of loan applicants and whether each application was approved. +Based on this, a binary classification model is to be built with maximum accuracy.
+ +#Basic and most important libraries
+import pandas as pd , numpy as np
+from sklearn.utils import resample
+from sklearn.preprocessing import StandardScaler , MinMaxScaler
+from collections import Counter
+from scipy import stats
+import matplotlib.pyplot as plt
+import seaborn as sns
+import plotly.express as px
+import plotly.figure_factory as ff
+import plotly
+
+#Classifiers
+from sklearn.ensemble import AdaBoostClassifier , GradientBoostingClassifier , VotingClassifier , RandomForestClassifier
+from sklearn.linear_model import LogisticRegression , RidgeClassifier
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+from sklearn.model_selection import RepeatedStratifiedKFold
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.model_selection import GridSearchCV
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.naive_bayes import GaussianNB
+from xgboost import plot_importance
+from xgboost import XGBClassifier
+from sklearn.svm import SVC
+
+#Model evaluation tools
+from sklearn.metrics import classification_report , accuracy_score , confusion_matrix
+from sklearn.metrics import accuracy_score,f1_score
+from sklearn.model_selection import cross_val_score
+
+#Data processing functions
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import train_test_split
+from sklearn import model_selection
+from sklearn.preprocessing import LabelEncoder
+le = LabelEncoder()
+
+import warnings
+warnings.filterwarnings("ignore")
+
data = pd.read_csv(r"C:\Master\Learning\Analytics_Vidhya\Loan_Prediction-Hackathon\train.csv")
+data.head(5)
+
sns.pairplot(data)
+plt.show()
+
data.describe()
+
data.info()
+
fig = px.scatter_matrix(data["ApplicantIncome"])
+fig.update_layout(width=700,height=400)
+fig.show()
+
It seems we need to work on data preparation:
+-The LoanAmount column does not follow a normal distribution
+-There are outliers in ApplicantIncome and CoapplicantIncome
+ +data.isnull().sum()
+
plt.figure(figsize=(10,6))
+sns.heatmap(data.isnull())
+
Central limit theorem +In simple language, the central limit theorem says that averages of samples tend towards a normal distribution, and for normally distributed data most of the data points lie near the mean of all +data points.
+To validate the normal distribution of the data:- +Mean, Mode and Median are equal.
+We can identify the distribution of the entire data with the help of the Mean and Standard Deviation.
+When the data is normally distributed, most of the data is concentrated near the mean value.
+To get an understanding of the distribution we can simply plot a distribution plot, i.e. a simple histogram.
+Normally distributed data forms a bell-shaped curve.
+Also, the Mean, Mode and Median of normally distributed data are equal (Mean = Mode = Median).
+One more check, after standardizing the data, is that the mean should be 0 or near 0 and the standard deviation 1 or near 1.
+Mean = sum(all data points) / count(data points)
+Standard Deviation = sqrt( sum[ (each data point - mean of whole data)^2 ] / count(data points) )
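+
+As a quick numerical cross-check of the histograms below (a minimal sketch; the |skew| < 0.5 cut-off is only a rough rule of thumb, and the normality test comes from the scipy.stats module already imported):
+
+#Quantify skewness and run D'Agostino's normality test for the numeric columns
+for col in ["ApplicantIncome", "CoapplicantIncome", "LoanAmount"]:
+    values = data[col].dropna()
+    print(col, "| skew:", round(stats.skew(values), 2),
+          "| normaltest p-value:", round(stats.normaltest(values)[1], 4))
+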
+ +#Checking whether the non-categorical variables are normally distributed and whether they contain outliers...
+
+print("Data distribution analysis:->---------------------------------------\n")
+print("\nMean:->\n")
+print("ApplicantIncome: ",np.mean(data["ApplicantIncome"]))
+print("CoapplicantIncome: ",np.mean(data["CoapplicantIncome"]))
+print("LoanAmount: ",np.mean(data["LoanAmount"]))
+
+print("\nMode:->\n")
+print("ApplicantIncome: ",stats.mode(data["ApplicantIncome"])[0])
+print("CoapplicantIncome: ",stats.mode(data["CoapplicantIncome"])[0])
+print("LoanAmount: ",stats.mode(data["LoanAmount"])[0])
+
+print("\nMedian:->\n")
+print("ApplicantIncome: ",np.median(data["ApplicantIncome"]))
+print("CoapplicantIncome: ",np.median(data["CoapplicantIncome"]))
+print("LoanAmount: ",np.median(data["LoanAmount"]))
+
+print("\nStandard Deviation:->\n")
+print("ApplicantIncome: ",np.std(data["ApplicantIncome"]))
+print("CoapplicantIncome: ",np.std(data["CoapplicantIncome"]))
+print("LoanAmount: ",np.std(data["LoanAmount"]))
+
+fig = px.histogram(data["ApplicantIncome"],x ="ApplicantIncome" ,y = "ApplicantIncome" )
+fig.update_layout(title="ApplicantIncome")
+fig.show()
+
+fig = px.histogram(data["CoapplicantIncome"],x ="CoapplicantIncome" ,y = "CoapplicantIncome" )
+fig.update_layout(title="CoapplicantIncome")
+fig.show()
+
+fig = px.histogram(data["LoanAmount"],x ="LoanAmount" ,y = "LoanAmount" )
+fig.update_layout(title="LoanAmount")
+fig.show()
+
From the above graphs we find that these variables are not normally distributed.
+All three variables show a right-skewed distribution.
+ +plt.figure(figsize=(10,5))
+fig = px.bar(data,x=data["Gender"])
+fig.show()
+
+fig = px.bar(data,x=data["Married"])
+fig.show()
+
+fig = px.bar(data,x=data["Education"],color="Education")
+fig.show()
+
+fig = px.bar(data,x=data["Self_Employed"])
+fig.show()
+
+fig = px.bar(data,x=data["Dependents"])
+fig.show()
+
+fig = px.bar(data,x=data["Property_Area"])
+fig.show()
+
+fig = px.bar(data,x=data["Loan_Status"],color="Loan_Status")
+fig.show()
+
Prepare the data for model training, i.e. remove outliers, fill null values and reduce skewness.
+ +print(data["Gender"].value_counts())
+print(data["Married"].value_counts())
+print(data["Self_Employed"].value_counts())
+print(data["Dependents"].value_counts())
+print(data["Credit_History"].value_counts())
+print(data["Loan_Amount_Term"].value_counts())
+
->Taking the mode of the values in a column is the best way to fill its null values. +->Not the mean, because the values are categorical rather than numeric.
+#Filling all NaN values with the mode of the respective variable
+data["Gender"].fillna(data["Gender"].mode()[0],inplace=True)
+data["Married"].fillna(data["Married"].mode()[0],inplace=True)
+data["Self_Employed"].fillna(data["Self_Employed"].mode()[0],inplace=True)
+data["Loan_Amount_Term"].fillna(data["Loan_Amount_Term"].mode()[0],inplace=True)
+data["Dependents"].fillna(data["Dependents"].mode()[0],inplace=True)
+data["Credit_History"].fillna(data["Credit_History"].mode()[0],inplace=True)
+
+#All values of the "Dependents" column are strings; converting them to integers.
+data["Dependents"] = data["Dependents"].replace('3+',int(3))
+data["Dependents"] = data["Dependents"].replace('1',int(1))
+data["Dependents"] = data["Dependents"].replace('2',int(2))
+data["Dependents"] = data["Dependents"].replace('0',int(0))
+
+data["LoanAmount"].fillna(data["LoanAmount"].median(),inplace=True)
+
+print(data.isnull().sum())
+
+#Heat map for null values
+plt.figure(figsize=(10,6))
+sns.heatmap(data.isnull())
+
#Treating outliers and transforming the data towards a normal distribution
+#Before the transformation:
+
+print("\nMean:->\n")
+print("ApplicantIncome: ",np.mean(data["ApplicantIncome"]))
+print("CoapplicantIncome: ",np.mean(data["CoapplicantIncome"]))
+print("LoanAmount: ",np.mean(data["LoanAmount"]))
+
+print("\nMode:->\n")
+print("ApplicantIncome: ",stats.mode(data["ApplicantIncome"])[0])
+print("CoapplicantIncome: ",stats.mode(data["CoapplicantIncome"])[0])
+print("LoanAmount: ",stats.mode(data["LoanAmount"])[0])
+
+print("\nMedian:->\n")
+print("ApplicantIncome: ",np.median(data["ApplicantIncome"]))
+print("CoapplicantIncome: ",np.median(data["CoapplicantIncome"]))
+print("LoanAmount: ",np.median(data["LoanAmount"]))
+
+print("\nStandard Deviation:->\n")
+print("ApplicantIncome: ",np.std(data["ApplicantIncome"]))
+print("CoapplicantIncome: ",np.std(data["CoapplicantIncome"]))
+print("LoanAmount: ",np.std(data["LoanAmount"]))
+
+fig = px.histogram(data["ApplicantIncome"],x ="ApplicantIncome" ,y = "ApplicantIncome" )
+fig.update_layout(title="ApplicantIncome")
+fig.show()
+
+fig = px.histogram(data["CoapplicantIncome"],x ="CoapplicantIncome" ,y = "CoapplicantIncome" )
+fig.update_layout(title="CoapplicantIncome")
+fig.show()
+
+fig = px.histogram(data["LoanAmount"],x ="LoanAmount" ,y = "LoanAmount" )
+fig.update_layout(title="LoanAmount")
+fig.show()
+
+####################################################################################################
+#Getting log value :->
+
+data["ApplicantIncome"] = np.log(data["ApplicantIncome"])
+#As "CoapplicantIncome" columns has some "0" values we will get log values except "0"
+data["CoapplicantIncome"] = [np.log(i) if i!=0 else 0 for i in data["CoapplicantIncome"]]
+data["LoanAmount"] = np.log(data["LoanAmount"])
+####################################################################################################
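+#Note (an alternative sketch, not applied here): np.log1p computes log(1 + x) and handles zero values
+#directly, so the special-casing of "CoapplicantIncome" above would not be needed:
+#data["CoapplicantIncome"] = np.log1p(data["CoapplicantIncome"])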
+
+print("---------------------------After converting to Normal Distributed data----------------------")
+
+print("\nMean:->\n")
+print("ApplicantIncome: ",np.mean(data["ApplicantIncome"]))
+print("CoapplicantIncome: ",np.mean(data["CoapplicantIncome"]))
+print("LoanAmount: ",np.mean(data["LoanAmount"]))
+
+print("\nMode:->\n")
+print("ApplicantIncome: ",stats.mode(data["ApplicantIncome"])[0])
+print("CoapplicantIncome: ",stats.mode(data["CoapplicantIncome"])[0])
+print("LoanAmount: ",stats.mode(data["LoanAmount"])[0])
+
+print("\nMedian:->\n")
+print("ApplicantIncome: ",np.median(data["ApplicantIncome"]))
+print("CoapplicantIncome: ",np.median(data["CoapplicantIncome"]))
+print("LoanAmount: ",np.median(data["LoanAmount"]))
+
+print("\nStandard Deviation:->\n")
+print("ApplicantIncome: ",np.std(data["ApplicantIncome"]))
+print("CoapplicantIncome: ",np.std(data["CoapplicantIncome"]))
+print("LoanAmount: ",np.std(data["LoanAmount"]))
+
+plt.figure(figsize=(10,4))
+fig = px.histogram(data["ApplicantIncome"],x ="ApplicantIncome" ,y = "ApplicantIncome" )
+fig.update_layout(title="ApplicantIncome")
+fig.show()
+
+fig = px.histogram(data["CoapplicantIncome"],x ="CoapplicantIncome" ,y = "CoapplicantIncome" )
+fig.update_layout(title="CoapplicantIncome")
+fig.show()
+
+fig = px.histogram(data["LoanAmount"],x ="LoanAmount" ,y = "LoanAmount" )
+fig.update_layout(title="LoanAmount")
+fig.show()
+
Now we can see a bell curve for all three variables; the data is approximately normally distributed.
+ +data.head(5)
+
data["Gender"] = le.fit_transform(data["Gender"])
+data["Married"] = le.fit_transform(data["Married"])
+data["Education"] = le.fit_transform(data["Education"])
+data["Self_Employed"] = le.fit_transform(data["Self_Employed"])
+data["Property_Area"] = le.fit_transform(data["Property_Area"])
+data["Loan_Status"] = le.fit_transform(data["Loan_Status"])
+
+#data = pd.get_dummies(data)
+data.head(5)
+
+
+
In order to create the best predictive model we need to understand the available data well and +extract as much information from it as possible.
+In multivariate data it is important to understand the importance of the variables and +how much each contributes towards the target variable, so that we can remove unnecessary variables and improve +model performance.
+Often a dataset contains extra columns that carry no useful information for classifying the data. +This can lead the model to learn spurious patterns during training.
+To understand feature importance we are going to use several machine learning classifiers and +then plot bar graphs of the importances.
+XGBoost also has a built-in feature importance plotting tool which we are going to use.
+Using more than one classifier increases our confidence in the decision of which variables to keep +and which to remove.
+ +#Dividing data into Input X variables and Target Y variable
+X = data.drop(["Loan_Status","Loan_ID"],axis=1)
+y = data["Loan_Status"]
+
print("Feature importance by XGBoost:->\n")
+XGBR = XGBClassifier()
+XGBR.fit(X,y)
+features = XGBR.feature_importances_
+Columns = list(X.columns)
+for i,j in enumerate(features):
+ print(Columns[i],"->",j)
+plt.figure(figsize=(16,5))
+plt.title(label="XGBC")
+plt.bar([x for x in range(len(features))],features)
+plt.show()
+
+plot_importance(XGBR)
+
+print("Feature importance by Random Forest:->\n")
+RF = RandomForestClassifier()
+RF.fit(X,y)
+features = RF.feature_importances_
+Columns = list(X.columns)
+for i,j in enumerate(features):
+ print(Columns[i],"->",j)
+plt.figure(figsize=(16,5))
+plt.title(label="RF")
+plt.bar([x for x in range(len(features))],features)
+plt.show()
+
+print("Feature importance by Decision Tree:->\n")
+DT = DecisionTreeClassifier()
+DT.fit(X,y)
+features = DT.feature_importances_
+Columns = list(X.columns)
+for i,j in enumerate(features):
+ print(Columns[i],"->",j)
+plt.figure(figsize=(16,5))
+plt.title(label="DT")
+plt.bar([x for x in range(len(features))],features)
+plt.show()
+
+print("Feature importance by Suppoprt Vector Machine:->\n")
+SVM = SVC(kernel="linear")
+SVM.fit(X,y)
+features = SVM.coef_[0]
+Columns = list(X.columns)
+for i,j in enumerate(features):
+ print(Columns[i],"->",j)
+plt.figure(figsize=(16,5))
+plt.bar([x for x in range(len(features))],features)
+plt.show()
+
+print("Feature importance by Logistic Regression:->\n")
+LOGC = LogisticRegression()
+LOGC.fit(X,y)
+features = LOGC.coef_[0]
+Columns = list(X.columns)
+for i,j in enumerate(features):
+ print(Columns[i],"->",j)
+plt.figure(figsize=(16,5))
+plt.title(label="LOGC")
+plt.bar([x for x in range(len(features))],features)
+plt.show()
+
From feature importance => Credit_History, ApplicantIncome, CoapplicantIncome and LoanAmount are the most important features.
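+
+As a model-agnostic cross-check (a minimal sketch, not part of the original analysis), permutation importance from sklearn.inspection can be computed for any of the fitted models above, e.g. the random forest RF:
+
+from sklearn.inspection import permutation_importance
+perm = permutation_importance(RF, X, y, n_repeats=10, random_state=0)
+for name, imp in sorted(zip(X.columns, perm.importances_mean), key=lambda t: -t[1]):
+    print(name, "->", round(imp, 4))
+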
+ +#Heat map of correlations between the remaining variables
+matrix = data.drop(["Gender","Married","Dependents","Education","Self_Employed"],axis=1).corr()
+#f , ax = plt.subplots(figsize=(18,6))
+plt.figure(figsize=(18,8))
+sns.heatmap(matrix,vmax=0.8,square=True,cmap="BuPu")
+
It seems that ApplicantIncome and LoanAmount are correlated, CoapplicantIncome is also correlated with LoanAmount, and +Credit_History is correlated with Loan_Status.
+ +A = list(data.Loan_Status).count(1)
+B = list(data.Loan_Status).count(0)
+print("Count of 1<Approved>: ",A,"\nCount of 0<Rejected>: ",B)
+
+fig = px.bar((A,B),x=["Approved","Rejected"],y=[A,B],color=[A,B])
+fig.show()
+
It seems that the data is highly imbalanced.
+When the target classes do not have roughly equal counts, the data is considered imbalanced.
+From the above graph, the dataset contains more records with an Approved Loan_Status than a Rejected one: +422 versus 192.
+If the difference were only 20-30 records, this imbalance could be ignored.
+Left as is, the imbalance can lead the model to make wrong assumptions and to be biased towards the majority class after training. +We will overcome this issue by balancing the data.
+To overcome this problem we will balance the data using resampling, both upsampling and downsampling.
+#Keep a copy of the original data so it can be reused later.
+new_data = data.copy()
+
+#Separating the data by Loan_Status 1 and 0.
+df_majority = new_data[new_data.Loan_Status==1]
+df_minority = new_data[new_data.Loan_Status==0]
+
+#Here we are downsampling the majority class data points,
+#i.e. sampling (without replacement) from the majority class until it has as many data points as the minority class.
+
+df_majority_downsampled = resample(df_majority,replace=False,n_samples=192,random_state=123)
+df_downsampled = pd.concat([df_majority_downsampled,df_minority])
+print("Downsampled data:->\n",df_downsampled.Loan_Status.value_counts())
+
+#Here we are upsampling the minority class data points,
+#i.e. resampling (with replacement) from the minority class until it has as many data points as the majority class.
+df_minority_upsampled = resample(df_minority,replace=True,n_samples=422,random_state=123)
+df_upsampled = pd.concat([df_majority,df_minority_upsampled])
+print("Upsampled data:->\n",df_upsampled.Loan_Status.value_counts())
+
+
Data normalization is required when variable values lie in very different ranges.
+For example, suppose we have two columns, "Age" and "Income",
+where "Age" ranges roughly from 0 to 100 +and "Income" ranges from 20,000 to 100,000.
+In that case the model can perform poorly on test data, because the input features are not on the same scale.
+So not always, but whenever we get such data, we need to rescale it.
+Widely used scaling tools are the Min-Max scaler and the Standard scaler.
+Data normalization is done by MinMaxScaler, which scales all values into the 0 to 1 range.
+Data standardization is done by StandardScaler, which rescales the data so that its mean is 0 and its standard deviation is 1.
+As our data is still not perfectly normally distributed, we will choose standardization with StandardScaler, aiming to reduce +the remaining skew and contribute to an accuracy gain. A small comparison sketch is shown below.
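+
+A minimal comparison sketch of the two scalers on the numeric columns (illustrative only; the modelling cells below fit the scaler on the training split):
+
+num_cols = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount"]
+mm = MinMaxScaler().fit_transform(data[num_cols])
+ss = StandardScaler().fit_transform(data[num_cols])
+print("Min-Max scaled -> min:", mm.min(axis=0), "max:", mm.max(axis=0))
+print("Standard scaled -> mean:", ss.mean(axis=0).round(3), "std:", ss.std(axis=0).round(3))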
+ +
+
In order to gain the maximum possible accuracy one needs to conduct many experiments.
+We will pass the data through the models in different states, one at a time, i.e.
+-Only scaled data
+-Scaled + downsampled data
+-Scaled + upsampled data
+-Scaled + upsampled data + selected features by their importance. (Since each experiment repeats the same model loop, a small helper sketch follows this list.)
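+
+Since each experiment below repeats the same fit/predict/report loop for every classifier, a small helper like the sketch below (not used in the original cells) could remove the duplication:
+
+def evaluate_models(models, X_train, X_test, y_train, y_test):
+    #Fit each (name, estimator) pair and print its accuracy on the held-out split.
+    for name, model in models:
+        model.fit(X_train, y_train)
+        y_pred = model.predict(X_test)
+        print(name, "accuracy:", round(accuracy_score(y_test, y_pred), 4))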
+ +#Experiment 1: Only Scaled data with all variables
+
+#X = new_data.drop(["Loan_ID","Gender","Married","Education","Self_Employed","Loan_Amount_Term","Loan_Status",'Property_Area'],axis=1)
+X = new_data.drop(["Loan_Status","Loan_ID"],axis=1)
+y = new_data["Loan_Status"]
+counter = Counter(y)
+print("Counter: ",counter)
+
+X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.25,random_state=0)
+
+#Scaling data here:------------->
+
+StSc = StandardScaler()
+X_train = StSc.fit_transform(X_train)
+X_test = StSc.transform(X_test)
+
+#Check mean is 0 and Standard deviation is 1
+print("After Standardization\nMean ",np.mean(X_train),"Standard Deviation ",np.std(X_train),"\n")
+
+#Voting ensemble method. Combining all the tree-based classifiers.
+models = []
+models.append(("XGB",XGBClassifier()))
+models.append(("RF",RandomForestClassifier()))
+models.append(("DT",DecisionTreeClassifier()))
+models.append(("ADB",AdaBoostClassifier()))
+models.append(("GB",GradientBoostingClassifier()))
+
+ensemble = VotingClassifier(estimators=models)
+ensemble.fit(X_train,y_train)
+y_pred = ensemble.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("Voting Ensemble:>",accuracy_score(y_pred,y_test))
+
+
+
+SVM = SVC(kernel="linear",class_weight="balanced",probability=True)
+SVM.fit(X_train,y_train)
+y_pred = SVM.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("SVM:>",accuracy_score(y_pred,y_test))
+
+
+XGBC = XGBClassifier(learning_rate =0.1,n_estimators=10000,max_depth=4,min_child_weight=6,gamma=0,subsample=0.6,colsample_bytree=0.8,
+ reg_alpha=0.005, objective= 'binary:logistic', nthread=2, scale_pos_weight=1, seed=27)
+XGBC.fit(X_train,y_train)
+y_pred = XGBC.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("XGBoost:>",accuracy_score(y_pred,y_test))
+
+Model1 = RandomForestClassifier(n_estimators=1000,random_state=0,n_jobs=1000,max_depth=70,bootstrap=True)
+Model1.fit(X_train,y_train)
+y_pred = Model1.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("RandomForestClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model2 = GradientBoostingClassifier()
+Model2.fit(X_train,y_train)
+y_pred = Model2.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("GradientBoostingClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model3 = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100,
+ max_features=1.0, max_leaf_nodes=10,
+ min_impurity_split=1e-07, min_samples_leaf=1,
+ min_samples_split=2, min_weight_fraction_leaf=0.10,
+ presort=False, random_state=27, splitter='best')
+Model3.fit(X_train,y_train)
+y_pred = Model3.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("DecisionTreeClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model4 = AdaBoostClassifier()
+Model4.fit(X_train,y_train)
+y_pred = Model4.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("AdaBoostClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model5 = LinearDiscriminantAnalysis()
+Model5.fit(X_train,y_train)
+y_pred = Model5.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("LinearDiscriminantAnalysis:>",accuracy_score(y_pred,y_test),"\n")
+
+
+KNN = KNeighborsClassifier(leaf_size=1,p=2,n_neighbors=20)
+KNN.fit(X_train,y_train)
+y_pred = KNN.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("KNeighborsClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model7 = GaussianNB()
+Model7.fit(X_train,y_train)
+y_pred = Model7.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("GaussianNB:>",accuracy_score(y_pred,y_test))
+
+
+Model8 = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
+ intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
+ penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
+ verbose=0, warm_start=False)
+Model8.fit(X_train,y_train)
+y_pred = Model8.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("Logistic Regression:>",accuracy_score(y_pred,y_test))
+
#Experiment 2: Scaled + Down Sampled Data
+
+#X = df_downsampled.drop(["Loan_ID","Gender","Married","Education","Self_Employed","Loan_Amount_Term","Loan_Status",'Property_Area'],axis=1)
+X = df_downsampled.drop(["Loan_Status","Loan_ID"],axis=1)
+y = df_downsampled.Loan_Status
+X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.25,random_state=0)
+
+
+#Scaling data here:------------->
+
+StSc = StandardScaler()
+X_train = StSc.fit_transform(X_train)
+X_test = StSc.transform(X_test)
+
+#Check mean is 0 and Standard deviation is 1
+print("After Standardization\nMean ",np.mean(X_train),"Standard Deviation ",np.std(X_train),"\n")
+
+#Voting ensemble method. Combining all the tree-based classifiers.
+models = []
+models.append(("XGB",XGBClassifier()))
+models.append(("RF",RandomForestClassifier()))
+models.append(("DT",DecisionTreeClassifier()))
+models.append(("ADB",AdaBoostClassifier()))
+models.append(("GB",GradientBoostingClassifier()))
+
+ensemble = VotingClassifier(estimators=models)
+ensemble.fit(X_train,y_train)
+y_pred = ensemble.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("Voting Ensemble:>",accuracy_score(y_pred,y_test))
+
+
+
+SVM = SVC(kernel="linear",class_weight="balanced",probability=True)
+SVM.fit(X_train,y_train)
+y_pred = SVM.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("SVM:>",accuracy_score(y_pred,y_test))
+
+
+XGBC = XGBClassifier(learning_rate =0.1,n_estimators=10000,max_depth=4,min_child_weight=6,gamma=0,subsample=0.6,colsample_bytree=0.8,
+ reg_alpha=0.005, objective= 'binary:logistic', nthread=2, scale_pos_weight=1, seed=27)
+XGBC.fit(X_train,y_train)
+y_pred = XGBC.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("XGBoost:>",accuracy_score(y_pred,y_test))
+
+Model1 = RandomForestClassifier(n_estimators=1000,random_state=0,n_jobs=1000,max_depth=70,bootstrap=True)
+Model1.fit(X_train,y_train)
+y_pred = Model1.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("RandomForestClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model2 = GradientBoostingClassifier()
+Model2.fit(X_train,y_train)
+y_pred = Model2.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("GradientBoostingClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model3 = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100,
+ max_features=1.0, max_leaf_nodes=10,
+ min_impurity_split=1e-07, min_samples_leaf=1,
+ min_samples_split=2, min_weight_fraction_leaf=0.10,
+ presort=False, random_state=27, splitter='best')
+Model3.fit(X_train,y_train)
+y_pred = Model3.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("DecisionTreeClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model4 = AdaBoostClassifier()
+Model4.fit(X_train,y_train)
+y_pred = Model4.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("AdaBoostClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model5 = LinearDiscriminantAnalysis()
+Model5.fit(X_train,y_train)
+y_pred = Model5.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("LinearDiscriminantAnalysis:>",accuracy_score(y_pred,y_test))
+
+KNN = KNeighborsClassifier(leaf_size=1,p=2,n_neighbors=20)
+KNN.fit(X_train,y_train)
+y_pred = KNN.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("KNeighborsClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model7 = GaussianNB()
+Model7.fit(X_train,y_train)
+y_pred = Model7.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("GaussianNB:>",accuracy_score(y_pred,y_test))
+
+
+Model8 = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
+ intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
+ penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
+ verbose=0, warm_start=False)
+Model8.fit(X_train,y_train)
+y_pred = Model8.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("Logistic Regression:>",accuracy_score(y_pred,y_test))
+
#Experiment 3: Scaled + Up Sampled Data
+
+#X = df_upsampled.drop(["Loan_ID","Gender","Married","Education","Self_Employed","Loan_Amount_Term","Loan_Status",'Property_Area'],axis=1)
+X = df_upsampled.drop(["Loan_Status","Loan_ID"],axis=1)
+y = df_upsampled.Loan_Status
+print(len(X),len(y))
+X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.25,random_state=0)
+
+#Scaling data here:------------->
+
+StSc = StandardScaler()
+X_train = StSc.fit_transform(X_train)
+X_test = StSc.transform(X_test)
+
+#Check mean is 0 and Standard deviation is 1
+print("After Standardization\nMean ",np.mean(X_train),"Standard Deviation ",np.std(X_train),"\n")
+
+#Voting ensemble method. Combining all the tree-based classifiers.
+models = []
+models.append(("XGB",XGBClassifier()))
+models.append(("RF",RandomForestClassifier()))
+models.append(("DT",DecisionTreeClassifier()))
+models.append(("ADB",AdaBoostClassifier()))
+models.append(("GB",GradientBoostingClassifier()))
+
+ensemble = VotingClassifier(estimators=models)
+ensemble.fit(X_train,y_train)
+y_pred = ensemble.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("Voting Ensemble:>",accuracy_score(y_pred,y_test))
+
+
+
+SVM = SVC(kernel="linear",class_weight="balanced",probability=True)
+SVM.fit(X_train,y_train)
+y_pred = SVM.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("SVM:>",accuracy_score(y_pred,y_test))
+
+
+XGBC = XGBClassifier(learning_rate =0.1,n_estimators=10000,max_depth=4,min_child_weight=6,gamma=0,subsample=0.6,colsample_bytree=0.8,
+ reg_alpha=0.005, objective= 'binary:logistic', nthread=2, scale_pos_weight=1, seed=27)
+XGBC.fit(X_train,y_train)
+y_pred = XGBC.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("XGBoost:>",accuracy_score(y_pred,y_test))
+
+Model1 = RandomForestClassifier(n_estimators=1000,random_state=0,n_jobs=1000,max_depth=70,bootstrap=True)
+Model1.fit(X_train,y_train)
+y_pred = Model1.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("RandomForestClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model2 = GradientBoostingClassifier()
+Model2.fit(X_train,y_train)
+y_pred = Model2.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("GradientBoostingClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model3 = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100,
+ max_features=1.0, max_leaf_nodes=10,
+ min_impurity_split=1e-07, min_samples_leaf=1,
+ min_samples_split=2, min_weight_fraction_leaf=0.10,
+ presort=False, random_state=27, splitter='best')
+Model3.fit(X_train,y_train)
+y_pred = Model3.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("DecisionTreeClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model4 = AdaBoostClassifier()
+Model4.fit(X_train,y_train)
+y_pred = Model4.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("AdaBoostClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model5 = LinearDiscriminantAnalysis()
+Model5.fit(X_train,y_train)
+y_pred = Model5.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("LinearDiscriminantAnalysis:>",accuracy_score(y_pred,y_test))
+
+KNN = KNeighborsClassifier(leaf_size=1,p=2,n_neighbors=20)
+KNN.fit(X_train,y_train)
+y_pred = KNN.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("KNeighborsClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model7 = GaussianNB()
+Model7.fit(X_train,y_train)
+y_pred = Model7.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("GaussianNB:>",accuracy_score(y_pred,y_test))
+
+
+Model8 = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
+ intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
+ penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
+ verbose=0, warm_start=False)
+Model8.fit(X_train,y_train)
+y_pred = Model8.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("Logistic Regression:>",accuracy_score(y_pred,y_test))
+
# Experiment 4: Scaled + Selected features with respective importance
+#Dropping less important features and keeping the features selected by the importance analysis.
+X = new_data.drop(["Loan_ID","Gender","Married","Education","Self_Employed","Loan_Amount_Term","Loan_Status","Property_Area"],axis=1)
+#X = new_data.drop(["Loan_Status","Loan_ID"],axis=1)
+y = new_data.Loan_Status
+print(len(X),len(y))
+X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.25,random_state=0)
+
+#Scaling data here:------------->
+
+StSc = StandardScaler()
+X_train = StSc.fit_transform(X_train)
+X_test = StSc.transform(X_test)
+
+#Check mean is 0 and Standard deviation is 1
+print("After Standardization\nMean ",np.mean(X_train),"Standard Deviation ",np.std(X_train),"\n")
+
+#Voting ensemble method. Combining all the tree-based classifiers.
+models = []
+models.append(("XGB",XGBClassifier()))
+models.append(("RF",RandomForestClassifier()))
+models.append(("DT",DecisionTreeClassifier()))
+models.append(("ADB",AdaBoostClassifier()))
+models.append(("GB",GradientBoostingClassifier()))
+
+ensemble = VotingClassifier(estimators=models)
+ensemble.fit(X_train,y_train)
+y_pred = ensemble.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("Voting Ensemble:>",accuracy_score(y_pred,y_test))
+
+
+
+SVM = SVC(kernel="linear",class_weight="balanced",probability=True)
+SVM.fit(X_train,y_train)
+y_pred = SVM.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("SVM:>",accuracy_score(y_pred,y_test))
+
+
+XGBC = XGBClassifier(learning_rate =0.1,n_estimators=10000,max_depth=4,min_child_weight=6,gamma=0,subsample=0.6,colsample_bytree=0.8,
+ reg_alpha=0.005, objective= 'binary:logistic', nthread=2, scale_pos_weight=1, seed=27)
+XGBC.fit(X_train,y_train)
+y_pred = XGBC.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("XGBoost:>",accuracy_score(y_pred,y_test))
+
+Model1 = RandomForestClassifier(n_estimators=1000,random_state=0,n_jobs=1000,max_depth=70,bootstrap=True)
+Model1.fit(X_train,y_train)
+y_pred = Model1.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("RandomForestClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model2 = GradientBoostingClassifier()
+Model2.fit(X_train,y_train)
+y_pred = Model2.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("GradientBoostingClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model3 = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100,
+ max_features=1.0, max_leaf_nodes=10,
+ min_impurity_split=1e-07, min_samples_leaf=1,
+ min_samples_split=2, min_weight_fraction_leaf=0.10,
+ presort=False, random_state=27, splitter='best')
+Model3.fit(X_train,y_train)
+y_pred = Model3.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("DecisionTreeClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model4 = AdaBoostClassifier()
+Model4.fit(X_train,y_train)
+y_pred = Model4.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("AdaBoostClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model5 = LinearDiscriminantAnalysis()
+Model5.fit(X_train,y_train)
+y_pred = Model5.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("LinearDiscriminantAnalysis:>",accuracy_score(y_pred,y_test))
+
+KNN = KNeighborsClassifier(leaf_size=1,p=2,n_neighbors=20)
+KNN.fit(X_train,y_train)
+y_pred = KNN.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("KNeighborsClassifier:>",accuracy_score(y_pred,y_test))
+
+
+Model7 = GaussianNB()
+Model7.fit(X_train,y_train)
+y_pred = Model7.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("GaussianNB:>",accuracy_score(y_pred,y_test))
+
+
+Model8 = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
+ intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
+ penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
+ verbose=0, warm_start=False)
+Model8.fit(X_train,y_train)
+y_pred = Model8.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("Logistic Regression:>",accuracy_score(y_pred,y_test))
+
#Hyperparameter tuning for KNN
+
+#X = new_data.drop(["Loan_ID","Gender","Married","Education","Self_Employed","Loan_Amount_Term","Loan_Status","Property_Area"],axis=1)
+X = new_data.drop(["Loan_Status","Loan_ID"],axis=1)
+y = new_data.Loan_Status
+print(len(X),len(y))
+X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.25,random_state=0)
+
+
+
+leaf_size = list(range(1,50))
+n_neighbors = list(range(1,30))
+p=[1,2]
+#Convert to dictionary
+hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
+#Create new KNN object
+knn_2 = KNeighborsClassifier()
+#Use GridSearch
+clf = GridSearchCV(knn_2, hyperparameters, cv=10)
+#Fit the model
+best_model = clf.fit(X_train,y_train)
+#Print The value of best Hyperparameters
+print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
+print('Best p:', best_model.best_estimator_.get_params()['p'])
+print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])
+
+LS = best_model.best_estimator_.get_params()['leaf_size']
+P = best_model.best_estimator_.get_params()['p']
+Num = best_model.best_estimator_.get_params()['n_neighbors']
+
+KNN = KNeighborsClassifier(leaf_size=LS,p=P,n_neighbors=Num)
+KNN.fit(X_train,y_train)
+y_pred = KNN.predict(X_test)
+print(classification_report(y_pred,y_test))
+print("KNeighborsClassifier:>",accuracy_score(y_pred,y_test))
+
+
# Tuning SVM parameters
+
+#X = new_data.drop(["Loan_ID","Gender","Married","Education","Self_Employed","Loan_Amount_Term","Loan_Status","Property_Area"],axis=1)
+X = new_data.drop(["Loan_Status","Loan_ID"],axis=1)
+y = new_data.Loan_Status
+print(len(X),len(y))
+X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.25,random_state=0)
+
+
+model = SVC()
+kernel = ['poly', 'rbf', 'sigmoid']
+C = [50, 10, 1.0, 0.1, 0.01]
+gamma = ['scale']
+# define grid search
+grid = dict(kernel=kernel,C=C,gamma=gamma)
+cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
+grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
+grid_result = grid_search.fit(X, y)
+# summarize results
+print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
+means = grid_result.cv_results_['mean_test_score']
+stds = grid_result.cv_results_['std_test_score']
+params = grid_result.cv_results_['params']
+for mean, stdev, param in zip(means, stds, params):
+ print("%f (%f) with: %r" % (mean, stdev, param))
+
Result summary is as below:-----> +Algorithm : Accuracy
+Experiment 1 : Scaled data only
+ +Support Vector Machine 83.116
+Decision Tree 83.1168
+Linear Discriminant Analysis 83.166
+KNearest Neighbors 83.766
+Gaussian Naive Bayes 83.116
+Logistic Regression 83.116
+
+
+Experiment 2: Scaled + Down Sampled Data
+ +AdaBoost 73.95
+Decision Tree 72.91
+Voting Ensemble 71.87
+
+
+
+Experiment 3: Scaled + Up Sampled Data
+ +Random Forest only 83.88
+
+
+Experiment 4: Scaled + Selected features with respective importance
+ +Support Vector Machine 83.11
+Decision Tree 83.11
+AdaBoost 82.46
+Linear Discriminant Analysis 83.11
+KNearest Neighbors 83.11
+Gaussian Naive Bayes 83.11
+Logistic Regression 83.11
+
+
+Also, after hyperparameter tuning:
+ +KNN 83.11
+
+
+After all the experiments, the maximum accuracy was achieved by balancing the data through upsampling. Surprisingly, only +Random Forest performed well on that version of the data.
+Also surprisingly, feature selection did not increase accuracy.
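+
+One caveat (an observation added here, not from the original notebook): because upsampling was applied before the train/test split, duplicated minority rows can appear in both the training and the test set, which may make the upsampled accuracy look optimistic. A minimal sketch of a more conservative check, using stratified cross-validation on the original (un-resampled) data and tools already imported above:
+
+from sklearn.pipeline import make_pipeline
+X = new_data.drop(["Loan_Status","Loan_ID"],axis=1)
+y = new_data["Loan_Status"]
+cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
+pipe = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=0))
+scores = cross_val_score(pipe, X, y, cv=cv, scoring="accuracy")
+print("Random Forest CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))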
+ +
+