From d7e0fca07042a45f8db7fac07911025cfbc59192 Mon Sep 17 00:00:00 2001
From: Annette Donald Blackburn
<109121304+annette-blackburn@users.noreply.github.com>
Date: Thu, 17 Nov 2022 10:12:55 -0700
Subject: [PATCH] Deliverable 3
---
credit_risk_ensemble.ipynb | 998 +++++++++++++++++++++++++++++++++++++
1 file changed, 998 insertions(+)
create mode 100644 credit_risk_ensemble.ipynb
diff --git a/credit_risk_ensemble.ipynb b/credit_risk_ensemble.ipynb
new file mode 100644
index 0000000..7561987
--- /dev/null
+++ b/credit_risk_ensemble.ipynb
@@ -0,0 +1,998 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from pathlib import Path\n",
+ "from collections import Counter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import balanced_accuracy_score\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "from imblearn.metrics import classification_report_imbalanced"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Read the CSV and Perform Basic Data Cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-\n",
+ "\n",
+ "columns = [\n",
+ " \"loan_amnt\", \"int_rate\", \"installment\", \"home_ownership\",\n",
+ " \"annual_inc\", \"verification_status\", \"issue_d\", \"loan_status\",\n",
+ " \"pymnt_plan\", \"dti\", \"delinq_2yrs\", \"inq_last_6mths\",\n",
+ " \"open_acc\", \"pub_rec\", \"revol_bal\", \"total_acc\",\n",
+ " \"initial_list_status\", \"out_prncp\", \"out_prncp_inv\", \"total_pymnt\",\n",
+ " \"total_pymnt_inv\", \"total_rec_prncp\", \"total_rec_int\", \"total_rec_late_fee\",\n",
+ " \"recoveries\", \"collection_recovery_fee\", \"last_pymnt_amnt\", \"next_pymnt_d\",\n",
+ " \"collections_12_mths_ex_med\", \"policy_code\", \"application_type\", \"acc_now_delinq\",\n",
+ " \"tot_coll_amt\", \"tot_cur_bal\", \"open_acc_6m\", \"open_act_il\",\n",
+ " \"open_il_12m\", \"open_il_24m\", \"mths_since_rcnt_il\", \"total_bal_il\",\n",
+ " \"il_util\", \"open_rv_12m\", \"open_rv_24m\", \"max_bal_bc\",\n",
+ " \"all_util\", \"total_rev_hi_lim\", \"inq_fi\", \"total_cu_tl\",\n",
+ " \"inq_last_12m\", \"acc_open_past_24mths\", \"avg_cur_bal\", \"bc_open_to_buy\",\n",
+ " \"bc_util\", \"chargeoff_within_12_mths\", \"delinq_amnt\", \"mo_sin_old_il_acct\",\n",
+ " \"mo_sin_old_rev_tl_op\", \"mo_sin_rcnt_rev_tl_op\", \"mo_sin_rcnt_tl\", \"mort_acc\",\n",
+ " \"mths_since_recent_bc\", \"mths_since_recent_inq\", \"num_accts_ever_120_pd\", \"num_actv_bc_tl\",\n",
+ " \"num_actv_rev_tl\", \"num_bc_sats\", \"num_bc_tl\", \"num_il_tl\",\n",
+ " \"num_op_rev_tl\", \"num_rev_accts\", \"num_rev_tl_bal_gt_0\",\n",
+ " \"num_sats\", \"num_tl_120dpd_2m\", \"num_tl_30dpd\", \"num_tl_90g_dpd_24m\",\n",
+ " \"num_tl_op_past_12m\", \"pct_tl_nvr_dlq\", \"percent_bc_gt_75\", \"pub_rec_bankruptcies\",\n",
+ " \"tax_liens\", \"tot_hi_cred_lim\", \"total_bal_ex_mort\", \"total_bc_limit\",\n",
+ " \"total_il_high_credit_limit\", \"hardship_flag\", \"debt_settlement_flag\"\n",
+ "]\n",
+ "\n",
+ "target = [\"loan_status\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " home_ownership | \n",
+ " annual_inc | \n",
+ " verification_status | \n",
+ " issue_d | \n",
+ " loan_status | \n",
+ " pymnt_plan | \n",
+ " dti | \n",
+ " ... | \n",
+ " pct_tl_nvr_dlq | \n",
+ " percent_bc_gt_75 | \n",
+ " pub_rec_bankruptcies | \n",
+ " tax_liens | \n",
+ " tot_hi_cred_lim | \n",
+ " total_bal_ex_mort | \n",
+ " total_bc_limit | \n",
+ " total_il_high_credit_limit | \n",
+ " hardship_flag | \n",
+ " debt_settlement_flag | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10500.0 | \n",
+ " 0.1719 | \n",
+ " 375.35 | \n",
+ " RENT | \n",
+ " 66000.0 | \n",
+ " Source Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 27.24 | \n",
+ " ... | \n",
+ " 85.7 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 65687.0 | \n",
+ " 38199.0 | \n",
+ " 2000.0 | \n",
+ " 61987.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 25000.0 | \n",
+ " 0.2000 | \n",
+ " 929.09 | \n",
+ " MORTGAGE | \n",
+ " 105000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 20.23 | \n",
+ " ... | \n",
+ " 91.2 | \n",
+ " 50.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 271427.0 | \n",
+ " 60641.0 | \n",
+ " 41200.0 | \n",
+ " 49197.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20000.0 | \n",
+ " 0.2000 | \n",
+ " 529.88 | \n",
+ " MORTGAGE | \n",
+ " 56000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 24.26 | \n",
+ " ... | \n",
+ " 66.7 | \n",
+ " 50.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 60644.0 | \n",
+ " 45684.0 | \n",
+ " 7500.0 | \n",
+ " 43144.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 10000.0 | \n",
+ " 0.1640 | \n",
+ " 353.55 | \n",
+ " RENT | \n",
+ " 92000.0 | \n",
+ " Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 31.44 | \n",
+ " ... | \n",
+ " 100.0 | \n",
+ " 50.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 99506.0 | \n",
+ " 68784.0 | \n",
+ " 19700.0 | \n",
+ " 76506.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 22000.0 | \n",
+ " 0.1474 | \n",
+ " 520.39 | \n",
+ " MORTGAGE | \n",
+ " 52000.0 | \n",
+ " Not Verified | \n",
+ " Mar-2019 | \n",
+ " low_risk | \n",
+ " n | \n",
+ " 18.76 | \n",
+ " ... | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 219750.0 | \n",
+ " 25919.0 | \n",
+ " 27600.0 | \n",
+ " 20000.0 | \n",
+ " N | \n",
+ " N | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 86 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment home_ownership annual_inc \\\n",
+ "0 10500.0 0.1719 375.35 RENT 66000.0 \n",
+ "1 25000.0 0.2000 929.09 MORTGAGE 105000.0 \n",
+ "2 20000.0 0.2000 529.88 MORTGAGE 56000.0 \n",
+ "3 10000.0 0.1640 353.55 RENT 92000.0 \n",
+ "4 22000.0 0.1474 520.39 MORTGAGE 52000.0 \n",
+ "\n",
+ " verification_status issue_d loan_status pymnt_plan dti ... \\\n",
+ "0 Source Verified Mar-2019 low_risk n 27.24 ... \n",
+ "1 Verified Mar-2019 low_risk n 20.23 ... \n",
+ "2 Verified Mar-2019 low_risk n 24.26 ... \n",
+ "3 Verified Mar-2019 low_risk n 31.44 ... \n",
+ "4 Not Verified Mar-2019 low_risk n 18.76 ... \n",
+ "\n",
+ " pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens \\\n",
+ "0 85.7 100.0 0.0 0.0 \n",
+ "1 91.2 50.0 1.0 0.0 \n",
+ "2 66.7 50.0 0.0 0.0 \n",
+ "3 100.0 50.0 1.0 0.0 \n",
+ "4 100.0 0.0 0.0 0.0 \n",
+ "\n",
+ " tot_hi_cred_lim total_bal_ex_mort total_bc_limit \\\n",
+ "0 65687.0 38199.0 2000.0 \n",
+ "1 271427.0 60641.0 41200.0 \n",
+ "2 60644.0 45684.0 7500.0 \n",
+ "3 99506.0 68784.0 19700.0 \n",
+ "4 219750.0 25919.0 27600.0 \n",
+ "\n",
+ " total_il_high_credit_limit hardship_flag debt_settlement_flag \n",
+ "0 61987.0 N N \n",
+ "1 49197.0 N N \n",
+ "2 43144.0 N N \n",
+ "3 76506.0 N N \n",
+ "4 20000.0 N N \n",
+ "\n",
+ "[5 rows x 86 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the data\n",
+ "# The path is relative to the notebook so the analysis runs on any machine; an\n",
+ "# absolute /Users/... path only works on one laptop. Keep LoanStats_2019Q1.csv\n",
+ "# next to this notebook, or edit file_path.\n",
+ "file_path = Path('LoanStats_2019Q1.csv')\n",
+ "# skiprows=1 skips the first line of the file; [:-2] drops the last two rows.\n",
+ "df = pd.read_csv(file_path, skiprows=1)[:-2]\n",
+ "df = df.loc[:, columns].copy()\n",
+ "\n",
+ "# Drop the null columns where all values are null\n",
+ "df = df.dropna(axis='columns', how='all')\n",
+ "\n",
+ "# Drop the null rows\n",
+ "df = df.dropna()\n",
+ "\n",
+ "# Remove the `Issued` loan status\n",
+ "# .copy() makes the filtered frame independent, so the column assignments\n",
+ "# below never write into a slice of the previous frame.\n",
+ "issued_mask = df['loan_status'] != 'Issued'\n",
+ "df = df.loc[issued_mask].copy()\n",
+ "\n",
+ "# convert interest rate to numerical (e.g. '17.19%' -> 0.1719)\n",
+ "df['int_rate'] = df['int_rate'].str.replace('%', '')\n",
+ "df['int_rate'] = df['int_rate'].astype('float') / 100\n",
+ "\n",
+ "\n",
+ "# Convert the target column values to low_risk and high_risk based on their values.\n",
+ "# Only `loan_status` is relabelled: a frame-wide df.replace() would also rewrite any\n",
+ "# feature column that happened to contain one of these strings.\n",
+ "risk_labels = {'Current': 'low_risk'}\n",
+ "risk_labels.update(\n",
+ "    dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')\n",
+ ")\n",
+ "df['loan_status'] = df['loan_status'].replace(risk_labels)\n",
+ "\n",
+ "# Renumber the rows 0..n-1 after the row drops above\n",
+ "df = df.reset_index(drop=True)\n",
+ "\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Split the Data into Training and Testing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create our features\n",
+ "X = pd.get_dummies(df.drop(columns='loan_status'))\n",
+ "\n",
+ "# Create our target\n",
+ "y = df['loan_status']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " loan_amnt | \n",
+ " int_rate | \n",
+ " installment | \n",
+ " annual_inc | \n",
+ " dti | \n",
+ " delinq_2yrs | \n",
+ " inq_last_6mths | \n",
+ " open_acc | \n",
+ " pub_rec | \n",
+ " revol_bal | \n",
+ " ... | \n",
+ " issue_d_Mar-2019 | \n",
+ " pymnt_plan_n | \n",
+ " initial_list_status_f | \n",
+ " initial_list_status_w | \n",
+ " next_pymnt_d_Apr-2019 | \n",
+ " next_pymnt_d_May-2019 | \n",
+ " application_type_Individual | \n",
+ " application_type_Joint App | \n",
+ " hardship_flag_N | \n",
+ " debt_settlement_flag_N | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 6.881700e+04 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " ... | \n",
+ " 68817.000000 | \n",
+ " 68817.0 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.000000 | \n",
+ " 68817.0 | \n",
+ " 68817.0 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 16677.594562 | \n",
+ " 0.127718 | \n",
+ " 480.652863 | \n",
+ " 8.821371e+04 | \n",
+ " 21.778153 | \n",
+ " 0.217766 | \n",
+ " 0.497697 | \n",
+ " 12.587340 | \n",
+ " 0.126030 | \n",
+ " 17604.142828 | \n",
+ " ... | \n",
+ " 0.177238 | \n",
+ " 1.0 | \n",
+ " 0.123879 | \n",
+ " 0.876121 | \n",
+ " 0.383161 | \n",
+ " 0.616839 | \n",
+ " 0.860340 | \n",
+ " 0.139660 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 10277.348590 | \n",
+ " 0.048130 | \n",
+ " 288.062432 | \n",
+ " 1.155800e+05 | \n",
+ " 20.199244 | \n",
+ " 0.718367 | \n",
+ " 0.758122 | \n",
+ " 6.022869 | \n",
+ " 0.336797 | \n",
+ " 21835.880400 | \n",
+ " ... | \n",
+ " 0.381873 | \n",
+ " 0.0 | \n",
+ " 0.329446 | \n",
+ " 0.329446 | \n",
+ " 0.486161 | \n",
+ " 0.486161 | \n",
+ " 0.346637 | \n",
+ " 0.346637 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1000.000000 | \n",
+ " 0.060000 | \n",
+ " 30.890000 | \n",
+ " 4.000000e+01 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 2.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 9000.000000 | \n",
+ " 0.088100 | \n",
+ " 265.730000 | \n",
+ " 5.000000e+04 | \n",
+ " 13.890000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 8.000000 | \n",
+ " 0.000000 | \n",
+ " 6293.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 15000.000000 | \n",
+ " 0.118000 | \n",
+ " 404.560000 | \n",
+ " 7.300000e+04 | \n",
+ " 19.760000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 11.000000 | \n",
+ " 0.000000 | \n",
+ " 12068.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 24000.000000 | \n",
+ " 0.155700 | \n",
+ " 648.100000 | \n",
+ " 1.040000e+05 | \n",
+ " 26.660000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 16.000000 | \n",
+ " 0.000000 | \n",
+ " 21735.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 40000.000000 | \n",
+ " 0.308400 | \n",
+ " 1676.230000 | \n",
+ " 8.797500e+06 | \n",
+ " 999.000000 | \n",
+ " 18.000000 | \n",
+ " 5.000000 | \n",
+ " 72.000000 | \n",
+ " 4.000000 | \n",
+ " 587191.000000 | \n",
+ " ... | \n",
+ " 1.000000 | \n",
+ " 1.0 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
8 rows × 95 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " loan_amnt int_rate installment annual_inc dti \\\n",
+ "count 68817.000000 68817.000000 68817.000000 6.881700e+04 68817.000000 \n",
+ "mean 16677.594562 0.127718 480.652863 8.821371e+04 21.778153 \n",
+ "std 10277.348590 0.048130 288.062432 1.155800e+05 20.199244 \n",
+ "min 1000.000000 0.060000 30.890000 4.000000e+01 0.000000 \n",
+ "25% 9000.000000 0.088100 265.730000 5.000000e+04 13.890000 \n",
+ "50% 15000.000000 0.118000 404.560000 7.300000e+04 19.760000 \n",
+ "75% 24000.000000 0.155700 648.100000 1.040000e+05 26.660000 \n",
+ "max 40000.000000 0.308400 1676.230000 8.797500e+06 999.000000 \n",
+ "\n",
+ " delinq_2yrs inq_last_6mths open_acc pub_rec \\\n",
+ "count 68817.000000 68817.000000 68817.000000 68817.000000 \n",
+ "mean 0.217766 0.497697 12.587340 0.126030 \n",
+ "std 0.718367 0.758122 6.022869 0.336797 \n",
+ "min 0.000000 0.000000 2.000000 0.000000 \n",
+ "25% 0.000000 0.000000 8.000000 0.000000 \n",
+ "50% 0.000000 0.000000 11.000000 0.000000 \n",
+ "75% 0.000000 1.000000 16.000000 0.000000 \n",
+ "max 18.000000 5.000000 72.000000 4.000000 \n",
+ "\n",
+ " revol_bal ... issue_d_Mar-2019 pymnt_plan_n \\\n",
+ "count 68817.000000 ... 68817.000000 68817.0 \n",
+ "mean 17604.142828 ... 0.177238 1.0 \n",
+ "std 21835.880400 ... 0.381873 0.0 \n",
+ "min 0.000000 ... 0.000000 1.0 \n",
+ "25% 6293.000000 ... 0.000000 1.0 \n",
+ "50% 12068.000000 ... 0.000000 1.0 \n",
+ "75% 21735.000000 ... 0.000000 1.0 \n",
+ "max 587191.000000 ... 1.000000 1.0 \n",
+ "\n",
+ " initial_list_status_f initial_list_status_w next_pymnt_d_Apr-2019 \\\n",
+ "count 68817.000000 68817.000000 68817.000000 \n",
+ "mean 0.123879 0.876121 0.383161 \n",
+ "std 0.329446 0.329446 0.486161 \n",
+ "min 0.000000 0.000000 0.000000 \n",
+ "25% 0.000000 1.000000 0.000000 \n",
+ "50% 0.000000 1.000000 0.000000 \n",
+ "75% 0.000000 1.000000 1.000000 \n",
+ "max 1.000000 1.000000 1.000000 \n",
+ "\n",
+ " next_pymnt_d_May-2019 application_type_Individual \\\n",
+ "count 68817.000000 68817.000000 \n",
+ "mean 0.616839 0.860340 \n",
+ "std 0.486161 0.346637 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 0.000000 1.000000 \n",
+ "50% 1.000000 1.000000 \n",
+ "75% 1.000000 1.000000 \n",
+ "max 1.000000 1.000000 \n",
+ "\n",
+ " application_type_Joint App hardship_flag_N debt_settlement_flag_N \n",
+ "count 68817.000000 68817.0 68817.0 \n",
+ "mean 0.139660 1.0 1.0 \n",
+ "std 0.346637 0.0 0.0 \n",
+ "min 0.000000 1.0 1.0 \n",
+ "25% 0.000000 1.0 1.0 \n",
+ "50% 0.000000 1.0 1.0 \n",
+ "75% 0.000000 1.0 1.0 \n",
+ "max 1.000000 1.0 1.0 \n",
+ "\n",
+ "[8 rows x 95 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "low_risk 68470\n",
+ "high_risk 347\n",
+ "Name: loan_status, dtype: int64"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Check the balance of our target values\n",
+ "y.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(51612, 95)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)\n",
+ "X_train.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Ensemble Learners\n",
+ "\n",
+ "In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier. For each algorithm, be sure to complete the following steps:\n",
+ "\n",
+ "1. Train the model using the training data. \n",
+ "2. Calculate the balanced accuracy score from sklearn.metrics.\n",
+ "3. Print the confusion matrix from sklearn.metrics.\n",
+ "4. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.\n",
+ "5. For the Balanced Random Forest Classifier only, print the feature importance sorted in descending order (most important feature to least important) along with the feature score.\n",
+ "\n",
+ "Note: Use a random state of 1 for each algorithm to ensure consistency between tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Balanced Random Forest Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "BalancedRandomForestClassifier(random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "BalancedRandomForestClassifier(random_state=1)"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Resample the training data with the BalancedRandomForestClassifier\n",
+ "from imblearn.ensemble import BalancedRandomForestClassifier\n",
+ "brfc_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)\n",
+ "brfc_model.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.7877672625306695"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Calculate the balanced accuracy score for the balanced random forest.\n",
+ "# balanced_accuracy_score is already imported in the imports cell at the top,\n",
+ "# so it is not re-imported here.\n",
+ "y_pred = brfc_model.predict(X_test)\n",
+ "balanced_accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[ 58, 29],\n",
+ " [ 1560, 15558]])"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix for the balanced random forest.\n",
+ "# Rows are the actual class and columns the predicted class, in sorted label\n",
+ "# order (high_risk, low_risk). confusion_matrix is already imported in the\n",
+ "# imports cell at the top, so it is not re-imported here.\n",
+ "confusion_matrix(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " pre rec spe f1 geo iba sup\n",
+ "\n",
+ " high_risk 0.04 0.67 0.91 0.07 0.78 0.59 87\n",
+ " low_risk 1.00 0.91 0.67 0.95 0.78 0.62 17118\n",
+ "\n",
+ "avg / total 0.99 0.91 0.67 0.95 0.78 0.62 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report for balanced random forest classifier.\n",
+ "# classification_report_imbalanced is already imported in the imports cell at\n",
+ "# the top, so it is not re-imported here.\n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "total_rec_prncp 0.073767\n",
+ "total_rec_int 0.063903\n",
+ "total_pymnt_inv 0.060733\n",
+ "total_pymnt 0.058112\n",
+ "last_pymnt_amnt 0.049518\n",
+ " ... \n",
+ "acc_now_delinq 0.000000\n",
+ "delinq_amnt 0.000000\n",
+ "chargeoff_within_12_mths 0.000000\n",
+ "hardship_flag_N 0.000000\n",
+ "debt_settlement_flag_N 0.000000\n",
+ "Length: 95, dtype: float64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Rank every feature by the forest's importance score, most important first\n",
+ "priority_features = (\n",
+ "    pd.Series(data=brfc_model.feature_importances_, index=X.columns)\n",
+ "    .sort_values(ascending=False)\n",
+ ")\n",
+ "print(priority_features)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Easy Ensemble AdaBoost Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "EasyEnsembleClassifier(n_estimators=100, random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
+ ],
+ "text/plain": [
+ "EasyEnsembleClassifier(n_estimators=100, random_state=1)"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Train the EasyEnsembleClassifier\n",
+ "from imblearn.ensemble import EasyEnsembleClassifier\n",
+ "eec_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)\n",
+ "eec_model.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.925427358175101"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Calculated the balanced accuracy score\n",
+ "y_pred = eec_model.predict(X_test)\n",
+ "balanced_accuracy_score(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[ 79, 8],\n",
+ " [ 979, 16139]])"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the confusion matrix\n",
+ "confusion_matrix(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " pre rec spe f1 geo iba sup\n",
+ "\n",
+ " high_risk 0.07 0.91 0.94 0.14 0.93 0.85 87\n",
+ " low_risk 1.00 0.94 0.91 0.97 0.93 0.86 17118\n",
+ "\n",
+ "avg / total 0.99 0.94 0.91 0.97 0.93 0.86 17205\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the imbalanced classification report for easy ensemble AdaBoost classifier \n",
+ "print(classification_report_imbalanced(y_test, y_pred))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "mlenv",
+ "language": "python",
+ "name": "mlenv"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}