# -*- coding: utf-8 -*-
"""Predictive.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/115JGRcjlVg3Xd0R5TZH-2KeyiZeNZFu3
# Predictive Analysis
In this notebook, I will use the [dataset](https://www.kaggle.com/mashlyn/online-retail-ii-uci) to build a machine learning model that estimates whether a given customer will buy something again from the shop in the next quarter. Customers are first segmented with unsupervised *K-means* clustering, and a supervised classifier is then trained on the resulting features.
<a id="importing-relevant-python-packages"></a>
## Importing Relevant Python Packages
"""
# importing necessary Python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
#import plotly.offline as pyoff
import plotly.graph_objs as go
#import plotly.figure_factory as ff
# avoid displaying warnings
import warnings
warnings.filterwarnings("ignore")
#import machine learning related libraries
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.cluster import KMeans
import xgboost as xgb
import time
"""<a id='importing-relevant-data'></a>
## Importing Relevant Data
"""
# Loading the data
from google.colab import files
uploaded = files.upload()
import io
df_data = pd.read_csv(io.BytesIO(uploaded['Deliv Data.csv']))
df_data.shape
df_data.head(5)
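"""If this notebook is run outside Colab, the `files.upload()` step above is unavailable; a minimal alternative (the local file path is an assumption) would be to read the CSV directly, as in the commented line below."""
# df_data = pd.read_csv("Deliv Data.csv")  # hypothetical local-file alternative to files.upload()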
"""<a id='data-engineering'></a>
## Data Engineering
In this section, we will explore the dataset to answer some general questions about it. This exploration will lead us to engineer additional features, which will later be used to build a machine learning model that answers our main question.
We can check information about the dataframe with the `info` method.
"""
df_data.info()
"""From the output of the `info` method, we can see that Non of the data point have missing values. Also we may want to change the data type of the objects in the __Date Processed__ column to proper date objects with the `to_datetime` method.
Let check if there is missing values in each column of the dataframe.
"""
df_data.isnull().sum()
df_data = df_data.dropna()
df_data.info()
"""Next, we update the dataframe `df_data` by converting the date field, __Date Processed__ to _datetime_ object. One can use the `to_datetime` method to achieve this."""
df_data['Date Processed'] = pd.to_datetime(df_data['Date Processed'])
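# Note: if the dates in the CSV are day-first strings (e.g. 02-01-2023 for 2 Jan 2023),
# passing dayfirst=True to pd.to_datetime would avoid ambiguous month/day parsing.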
df_data.head()
pd.DataFrame(df_data['Date Processed'].describe())
"""From the above, we see that the Delivery made by customers was from 02-01-2023 to 30-03-2024
In the subsequent cells, we will answer some questions we would like to know from the given dataset.
<a id='exp-qxn-1'></a>
### Exploration
> How many customers are there in the dataset, and how many different locations do they come from?
"""
print('From the dataset, MH MART Limited has {} customers from {} different Locations in the Country.'.format(
len(df_data['Ship To Party'].unique()), len(df_data['Location'].unique())))
"""<a id='exp-qxn-2'></a>
> What are the Locations that are most represented in the dataset?
"""
LOC_df = df_data.groupby(['Ship To Party', 'Location']).size().reset_index(name='Count')
LOC_df = LOC_df.groupby('Location')['Ship To Party'].nunique().reset_index(name='Unique Customers')
LOC_df = LOC_df.sort_values(by='Unique Customers', ascending=False)
total_customers = LOC_df['Unique Customers'].sum()
LOC_df['Percentage'] = (LOC_df['Unique Customers'] / total_customers) * 100
LOC_df['Percentage'] = LOC_df['Percentage'].round(2) # Round to 2 decimal places
top_locations = LOC_df.head(20)
print(top_locations)
"""The output above, shows the top $20$ LOCATIONS of MH MART LIMITED Customers across the Country in the past year till today"""
Branch_df = df_data.groupby(['Ship To Party', 'Branch']).size().reset_index(name='Count')
Branch_df = Branch_df.groupby('Branch')['Ship To Party'].nunique().reset_index(name='No of Customers')
Branch_df = Branch_df.sort_values(by='No of Customers', ascending=False)
total_customers = Branch_df['No of Customers'].sum()
Branch_df['Percentage'] = (Branch_df['No of Customers'] / total_customers) * 100
Branch_df['Percentage'] = Branch_df['Percentage'].round(2) # Round to 2 decimal places
Branch = Branch_df
print(Branch)
"""<a id='exp-qxn-3'></a>
### Exploration
> What is the total NPS value in each month, and what percentage of the total NPS does each Branch account for?
"""
df_data['InvoiceYearMonth'] = df_data['Date Processed'].map(lambda date: 100*date.year + date.month)
df_data.head()
# Convert 'NPS Value' to numeric before calculations
df_data['NPS Value'] = pd.to_numeric(df_data['NPS Value'], errors='coerce')
# Calculate the total NPS made in each month
nps_by_month = df_data.groupby('InvoiceYearMonth')['NPS Value'].sum().reset_index()
print(nps_by_month)
# Calculate the percentage NPS based on Branch
branch_nps = df_data.groupby('Branch')['NPS Value'].sum().reset_index()
total_nps = branch_nps['NPS Value'].sum()
branch_nps['Percentage NPS'] = (branch_nps['NPS Value'] / total_nps) * 100
# Round the percentage to the nearest whole number
branch_nps['Percentage NPS'] = branch_nps['Percentage NPS'].round().astype(int)
# Remove South Branch Waters and North Branch Waters
branch_nps = branch_nps[~branch_nps['Branch'].isin(['South Branch Waters', 'North Branch Waters'])]
# Sort the table in ascending order
branch_nps = branch_nps.sort_values(by='Percentage NPS')
# Display the table
print(branch_nps)
# Create a donut chart with fine color fills
fig = px.pie(branch_nps, values='Percentage NPS', names='Branch', hole=0.6,
color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(title_text='Percentage NPS by Branch', title_x=0.5)
fig.show()
# Convert 'Order quantity' to numeric before calculations
df_data['Quantity (Cases)'] = pd.to_numeric(df_data['Quantity (Cases)'], errors='coerce')
# Calculate the total order quantity made in each month
Order_quantity_by_month = df_data.groupby('InvoiceYearMonth')['Quantity (Cases)'].sum().reset_index()
# Display the table
print(Order_quantity_by_month)
# Calculate the percentage order quantity based on Branch
branch_quantity = df_data.groupby('Branch')['Quantity (Cases)'].sum().reset_index()
total_quantity = branch_quantity['Quantity (Cases)'].sum()
branch_quantity['Percentage Quantity'] = (branch_quantity['Quantity (Cases)'] / total_quantity) * 100
# Round the percentage to the nearest whole number
branch_quantity['Percentage Quantity'] = branch_quantity['Percentage Quantity'].round().astype(int)
# Remove South Branch Waters and North Branch Waters
branch_quantity = branch_quantity[~branch_quantity['Branch'].isin(['South Branch Waters', 'North Branch Waters'])]
# Sort the table in ascending order
branch_quantity = branch_quantity.sort_values(by='Percentage Quantity')
# Display the table
print(branch_quantity)
# Create a donut chart with fine color fills
fig = px.pie(branch_quantity, values='Percentage Quantity', names='Branch', hole=0.6,
color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(title_text='Percentage Order Quantity by Branch', title_x=0.5)
fig.show()
# @title Vehicle Capacity (Tons)
from matplotlib import pyplot as plt
import seaborn as sns
filtered_data = df_data[(df_data['Vehicle Capacity (Tons)'] != "0") & (df_data['Vehicle Capacity (Tons)'] != "1 Tons")]
filtered_data.groupby('Vehicle Capacity (Tons)').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)
df_data['NPS Value'] = pd.to_numeric(df_data['NPS Value'], errors='coerce')
ctm_NPS = df_data.groupby('InvoiceYearMonth')['NPS Value'].sum().reset_index()
ctm_NPS['NPS Value'] = ctm_NPS['NPS Value'].round(0).astype(int)
print(ctm_NPS)
pd.DataFrame(ctm_NPS['NPS Value'].describe())
# plot line plot
line_fig = px.line(ctm_NPS,
x = "InvoiceYearMonth",
y = "NPS Value",
title = "Montly NPS from JAN, 2023 to MAR, 2024",
template= "plotly_dark"
)
line_fig.update_layout(title_x=0.5,
showlegend=False,
xaxis={"type": "category"},
xaxis_title="Invoice Year-Month",
yaxis_title="Monthly NPS"
)
line_fig.show(config={'displaylogo': False})
Cus_NPS_df = df_data.groupby(['Customer Name'])['NPS Value'].sum().reset_index().sort_values(by='NPS Value', ascending=False)
total_NPS = Cus_NPS_df['NPS Value'].sum()
Cus_NPS_df['Percentage'] = np.round(Cus_NPS_df['NPS Value'] / total_NPS * 100, 2)
Cus_NPS_df.head(10)
"""From the output above, the top $10$ Customers with respect to NPS generated"""
def format_currency(n):
return f"${n:,.2f}"
Cus_NPS_df = df_data.groupby(['Customer Name'])['NPS Value'].sum().reset_index().sort_values(by='NPS Value', ascending=False)
total_NPS = Cus_NPS_df['NPS Value'].sum()
Cus_NPS_df['Percentage'] = np.round(Cus_NPS_df['NPS Value'] / total_NPS * 100, 2)
Cus_NPS_df['NPS Value'] = Cus_NPS_df['NPS Value'].map(format_currency)
Cus_NPS_df.head(10)
"""<a id='predicting-customer-purchase'></a>
## Predicting Customer Purchase
The goal of this section is to build a model from the dataframe `df_data` that estimates when a given customer will next purchase from MH MART LIMITED.
The dataframe is split into two parts:
* The first sub-dataframe, assigned to the Python variable `ctm_bhvr_dt`, contains purchases made by customers from `02-01-2023` to `31-12-2023`. It is used to study the purchasing behaviour of the customers.
* The second sub-dataframe, assigned to the Python variable `ctm_next_quarter`, contains purchases made in the next quarter, i.e., from `02-01-2024` to `31-03-2024`. It is used to determine each customer's first purchase date in that quarter.
"""
df_data.head()
# Filter data for the behaviour period (Jan 1, 2023, to Dec 31, 2023)
ctm_bhvr_dt = df_data[(df_data['Date Processed'] < pd.Timestamp(2024, 1, 1)) &
                      (df_data['Date Processed'] >= pd.Timestamp(2023, 1, 1))].reset_index(drop=True)
# Filter data for the next quarter (Jan 1, 2024, to Mar 31, 2024)
ctm_next_quarter = df_data[(df_data['Date Processed'] < pd.Timestamp(2024, 4, 1)) &
                           (df_data['Date Processed'] >= pd.Timestamp(2024, 1, 1))].reset_index(drop=True)
# Get the distinct customers in the dataframe ctm_bhvr_dt
ctm_dt = pd.DataFrame(ctm_bhvr_dt['Ship To Party'].unique())
# Rename the column to Ship To Code.
ctm_dt.columns = ['Ship To Code']
ctm_dt.head()
"""first purchase made by each customer in the next quarter."""
# Create a dataframe with Ship To Party and customers first purchase
# date in ctm_next_quarter
ctm_1st_purchase_in_next_quarter = ctm_next_quarter.groupby('Ship To Party')['Date Processed'].min().reset_index()
ctm_1st_purchase_in_next_quarter.columns = ['Ship To Code','MinPurchaseDate']
ctm_1st_purchase_in_next_quarter.head()
"""Last purchase made by each customer in the dataframe `ctm_bhvr_dt`."""
ctm_last_purchase_bhvr_dt = ctm_bhvr_dt.groupby('Ship To Party')['Date Processed'].max().reset_index()
ctm_last_purchase_bhvr_dt.columns = ['Ship To Code','MaxPurchaseDate']
ctm_last_purchase_bhvr_dt.head()
"""Let's merge the two dataframes `ctm_last_purchase_bhvr_dt` and `ctm_1st_purchase_in_next_quarter`."""
# Merge two dataframes ctm_last_purchase_bhvr_dt and ctm_1st_purchase_in_next_quarter
ctm_purchase_dates = pd.merge(ctm_last_purchase_bhvr_dt, ctm_1st_purchase_in_next_quarter, on='Ship To Code',
how='left')
ctm_purchase_dates.head()
"""Let's calculate the time difference in days between customer's last purchase in the dataframe `ctm_last_purchase_bhvr_dt` and the first purchase in the dataframe `ctm_1st_purchase_in_next_quarter`."""
ctm_purchase_dates['NextPurchaseDay'] = (ctm_purchase_dates['MinPurchaseDate'] - ctm_purchase_dates['MaxPurchaseDate']).dt.days
ctm_purchase_dates.head()
# merge with ctm_dt
ctm_dt = pd.merge(ctm_dt, ctm_purchase_dates[['Ship To Code','NextPurchaseDay']], on='Ship To Code', how='left')
ctm_dt.head()
"""Update the dataframe `ctm_dt` by filling all missing values with $9999$."""
missing_values = ctm_dt.isnull().sum()
print(missing_values)
total_missing_values = missing_values.sum()
if total_missing_values > 0:
print(f'There are {total_missing_values} missing values in the DataFrame.')
else:
print('There are no missing values in the DataFrame.')
ctm_dt = ctm_dt.fillna(9999)
ctm_dt.head()
"""Next, we will define some features and add them to the dataframe `ctm_dt` to build our machine learning model. We will use the Recency - Frequency - Monetary Value segmentation method. That is, we will put the customers into groups based on the following:
* __Recency__: Customers purchase behaviour based on their most recent purchase date and how many days they have been inactive since their last purchase.
* __Frequency__: Customers purchase behaviour based on the number of times they buy from MH MART LIMITED
* __Monetary Value__/__Revenue__: Customers purchase behaviour based the revenue they generate.
After we will apply *K-means* clustering to assign customers a score to each of the features.
<a id='recency'></a>
#### Recency
Let's find the most recent purchase date of each customer and see how many days they have been inactive. Afterwards, we can apply *K-means* clustering to assign customers a recency score.
"""
ctm_max_purchase = ctm_bhvr_dt.groupby('Ship To Party')['Date Processed'].max().reset_index()
ctm_max_purchase.columns = ['Ship To Code','MaxPurchaseDate']
ctm_max_purchase.head()
# Find the recency in days
ctm_max_purchase['Recency'] = (ctm_max_purchase['MaxPurchaseDate'].max() - ctm_max_purchase['MaxPurchaseDate']).dt.days
# Merge the dataframes ctm_dt and ctm_max_purchase[['Ship To Code', 'Recency']] on the Ship To Code column.
ctm_dt = pd.merge(ctm_dt, ctm_max_purchase[['Ship To Code', 'Recency']], on='Ship To Code')
ctm_dt.head()
pd.DataFrame(ctm_dt.Recency.describe())
"""The mean _Recency_ is approximately $25$ days whiles the median is $7$ days."""
# plot histogram
hist_fig = px.histogram(ctm_dt,
x="Recency",
title="Customers Recency in Days",
template= "plotly_white"
)
hist_fig.update_layout(title_x=0.5,
xaxis_title="Recency in groups of 50 days",
yaxis_title="Number of Customers"
)
hist_fig.show(config={'displaylogo': True})
"""Next we will apply _K-means_ clustering to assign a recency score. However, we need to know how many clusters in order to use the _K-means_ algorithm. We will apply _Elbow Method_ to determine how many clusters we will need. The _Elbow Method_ simply tells the optimal cluster number for optimal inertia."""
# Elbow method: fit K-means for 1 to 9 clusters on Recency and record the inertia for each
my_dict = {}
ctm_recency = ctm_dt[['Recency']].copy()
for idx in range(1, 10):
    kmeans = KMeans(n_clusters=idx, max_iter=1000).fit(ctm_recency)
    my_dict[idx] = kmeans.inertia_
line_fig = px.line(x=list(my_dict.keys()),
y=list(my_dict.values()),
template="plotly_white"
)
line_fig.update_layout(title_x=0,
                       xaxis_title="Number of clusters",
                       yaxis_title="Inertia"
                       )
line_fig.show(config={'displaylogo': True})
"""From the Figure above, $9$ seem to be the optimal one."""
number_of_clusters = 9
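"""As an optional cross-check of the elbow choice (not part of the original analysis), the silhouette score can be computed for each candidate number of clusters; higher values indicate better-separated clusters."""
from sklearn.metrics import silhouette_score
for k in range(2, 10):
    labels = KMeans(n_clusters=k, max_iter=1000).fit_predict(ctm_dt[['Recency']])
    print(k, round(silhouette_score(ctm_dt[['Recency']], labels), 3))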
"""Let's build a $9$ clusters for recency and add it to dataframe, `ctm_dt`."""
kmeans = KMeans(n_clusters=number_of_clusters)
kmeans.fit(ctm_dt[['Recency']])
ctm_dt['RecencyCluster'] = kmeans.predict(ctm_dt[['Recency']])
ctm_dt.head()
def order_cluster(df, target_field_name, cluster_field_name, ascending):
"""
INPUT:
- df - pandas DataFrame
- target_field_name - str - A column in the pandas DataFrame df
- cluster_field_name - str - Expected to be a column in the pandas DataFrame df
- ascending - Boolean
OUTPUT:
- df_final - pandas DataFrame: the input df with cluster_field_name relabelled so that cluster numbers follow the ordering of the mean target_field_name
"""
# Add the string "new_" to cluster_field_name
new_cluster_field_name = "new_" + cluster_field_name
# Create a new dataframe by grouping the input dataframe by cluster_field_name and extract target_field_name
# and find the mean
df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
# Sort the new dataframe df_new by target_field_name in the order given by the `ascending` flag
df_new = df_new.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
# Create a new column in df_new with column name index and assign it values to df_new.index
df_new["index"] = df_new.index
# Create a new dataframe by merging input dataframe df and part of the columns of df_new based on
# cluster_field_name
df_final = pd.merge(df, df_new[[cluster_field_name, "index"]], on=cluster_field_name)
# Update the dataframe df_final by deleting the column cluster_field_name
df_final = df_final.drop([cluster_field_name], axis=1)
# Rename the column index to cluster_field_name
df_final = df_final.rename(columns={"index": cluster_field_name})
return df_final
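"""The K-means fit/predict and `order_cluster` steps are repeated below for Recency, Frequency, and NPS VALUE. A small helper like the following (a sketch only; the cells below keep the explicit steps) could wrap that pattern:"""
def add_ordered_cluster(df, feature, cluster_field, n_clusters=number_of_clusters, ascending=False):
    # Fit K-means on a single feature, attach the raw labels, then relabel them with
    # order_cluster so that cluster numbers follow the mean value of the feature.
    km = KMeans(n_clusters=n_clusters)
    df[cluster_field] = km.fit_predict(df[[feature]])
    return order_cluster(df, feature, cluster_field, ascending)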
ctm_dt = order_cluster(ctm_dt, 'Recency', 'RecencyCluster', False)
ctm_dt.head()
#print cluster characteristics
ctm_dt.groupby('RecencyCluster')['Recency'].describe()
"""Observe from the above that, $8$ covers the most recent customers whereas $0,1,2,3$ has the most inactive customers."""
"""<a id='frequency'></a>
#### Frequency
Next, we will measure customers' purchase behaviour by the number of times they buy from MH MART LIMITED, i.e., the total number of orders placed by each customer.
"""
#get order counts for each user and create a dataframe with it
ctm_frequency = df_data.groupby('Ship To Party')['Date Processed'].count().reset_index()
ctm_frequency.columns = ['Ship To Code','Frequency']
#add this data to our main ctm_dt
ctm_dt = pd.merge(ctm_dt, ctm_frequency, on='Ship To Code')
ctm_dt.head()
pd.DataFrame(ctm_dt.Frequency.describe())
# plot histogram
hist_fig = px.histogram(x=ctm_dt.query('Frequency < 700')['Frequency'],
title="Customers with Purchase Frequency less than 700",
template= "plotly_white"
)
hist_fig.update_layout(title_x=0.5,
xaxis_title="Customer Frequency Purchase in groups of 100",
yaxis_title="Number of Customers"
)
hist_fig.show(config={'displaylogo': True})
kmeans = KMeans(n_clusters=number_of_clusters)
kmeans.fit(ctm_dt[['Frequency']])
ctm_dt['FrequencyCluster'] = kmeans.predict(ctm_dt[['Frequency']])
# Order clusters so that a higher FrequencyCluster number corresponds to a higher purchase frequency
ctm_dt = order_cluster(ctm_dt, 'Frequency', 'FrequencyCluster', True)
ctm_dt.head()
#see details of each cluster
ctm_dt.groupby('FrequencyCluster')['Frequency'].describe()
"""As it was for the case of the Recency, higher frequency number means better customers."""
"""<a id='revenue'></a>
#### Revenue
"""
ctm_revenue = df_data.groupby('Ship To Party')['NPS Value'].sum().reset_index()
ctm_revenue.columns = ['Ship To Code','NPS VALUE']
#merge it with our ctm_dt
ctm_dt = pd.merge(ctm_dt, ctm_revenue, on='Ship To Code')
ctm_dt.head()
#apply clustering
kmeans = KMeans(n_clusters=number_of_clusters)
kmeans.fit(ctm_dt[['NPS VALUE']])
ctm_dt['NPSVALUECluster'] = kmeans.predict(ctm_dt[['NPS VALUE']])
#order the cluster numbers
ctm_dt = order_cluster(ctm_dt, 'NPS VALUE', 'NPSVALUECluster', True)
ctm_dt.head()
#show details of the dataframe
ctm_dt.groupby('NPSVALUECluster')['NPS VALUE'].describe()
"""<a id='overall-score'></a>
#### Overall Score
Finally, we sum the three cluster scores to obtain an overall score.
"""
#calculate overall score and use mean() to see details
ctm_dt['OverallScore'] = ctm_dt['RecencyCluster'] + ctm_dt['FrequencyCluster'] + ctm_dt['NPSVALUECluster']
# Group by 'OverallScore' and calculate the mean of relevant columns
score_details = ctm_dt.groupby('OverallScore').agg({'Recency': 'mean', 'Frequency': 'mean', 'NPS VALUE': 'mean'})
# Display the details of mean values for each cluster
print(score_details)
ctm_dt['Segment'] = 'Low-Value'
ctm_dt.loc[ctm_dt['OverallScore'] > 10, 'Segment'] = 'Mid-Value'
ctm_dt.loc[ctm_dt['OverallScore'] > 13, 'Segment'] = 'High-Value'
ctm_dt.head()
"""Let us create a copy of the dataframe `ctm_dt` and apply the method `get_dummies` to it so as to convert all categorical column `Segment` to indicator variables."""
#create ctm_class as a copy of ctm_dt before applying get_dummies
ctm_class = ctm_dt.copy()
ctm_class = pd.get_dummies(ctm_class)
ctm_class.head()
"""Since our goal is to estimate whether a customer will make a purchase next, we will create a new column `NextPurchaseDayRange` with values as either $1$ or $0$ defined as follows:
* If the value is $1$, then it indicates that the customer will buy something in the next quarter, i.e., $7$ days from his or her last purchase.
* The value $0$ indicates that the customer will buy something in more than $7$ days from his or her last purchase.
"""
ctm_class['NextPurchaseDayRange'] = 1 ## less than 7 days
ctm_class.loc[ctm_class.NextPurchaseDay>7,'NextPurchaseDayRange'] = 0 # more than 7 days
ctm_class.head()
"""Finally in this section, let's see the correlation between our features and label. We can achieve this by applying the `corr` method to the dataframe `ctm_dt`."""
corr_matrix = ctm_class[ctm_class.columns].corr()
corr_df = pd.DataFrame(corr_matrix.min())
corr_df.columns = ['MinCorrelationCoeff']
corr_df['MaxCorrelationCoeff'] = corr_matrix[corr_matrix < 1].max()
corr_df
"""From the output above, we observe that __NPS VALUE__ has the highest positive correlation of $0.96$ with __RecencyCluster__ and __Segment_Low-Value__ has the highest negative of $-0.95$.
We can get a good visualisation of the coefficient matrix below.
"""
plt.figure(figsize = (15, 15))
sns.heatmap(corr_matrix, annot = True, linewidths=0.2, fmt=".2f");
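"""As a complementary check (not in the original output), we can look directly at how each feature correlates with the label `NextPurchaseDayRange`, sorted by absolute correlation."""
label_corr = corr_matrix['NextPurchaseDayRange'].drop('NextPurchaseDayRange')
print(label_corr.reindex(label_corr.abs().sort_values(ascending=False).index))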
"""<a id='building-machine-learning-models'></a>
## Building Machine Learning Models
"""
ctm_class = ctm_class.drop('NextPurchaseDay', axis=1)
X, y = ctm_class.drop('NextPurchaseDayRange', axis=1), ctm_class.NextPurchaseDayRange
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=None, shuffle=True)
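# Note: random_state=None means this split (and therefore the scores below) will vary between runs;
# fixing it (e.g. random_state=42) would make the results reproducible.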
# Create an array of models
models = []
models.append(("LogisticRegression", LogisticRegression()))
models.append(("GaussianNB", GaussianNB()))
models.append(("RandomForestClassifier", RandomForestClassifier()))
models.append(("SVC", SVC()))
models.append(("DecisionTreeClassifier", DecisionTreeClassifier()))
models.append(("xgb.XGBClassifier", xgb.XGBClassifier(eval_metric='mlogloss')))
models.append(("KNeighborsClassifier", KNeighborsClassifier()))
# A dictionary for all the distinct models and their respective metrics
model_scores_dict = {'model_name': [],
'accuracy': [],
'f1_score': [],
'recall': [],
'precision': [],
'time': []
}
# For each model name and model in models
for model_name, model in models:
# Add model_name to model_scores_dict
model_scores_dict['model_name'].append(model_name)
kfold = KFold(n_splits=2, random_state=24, shuffle=True)
start = time.time()
# Perform cross-validation and get scores
cv_accuracy = np.mean(cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy'))
cv_f1_score = np.mean(cross_val_score(model, X_train, y_train, cv=kfold, scoring='f1_macro'))
cv_recall = np.mean(cross_val_score(model, X_train, y_train, cv=kfold, scoring='recall_macro'))
cv_precision = np.mean(cross_val_score(model, X_train, y_train, cv=kfold, scoring='precision_macro'))
# Update model_scores_dict with scores for model_name
model_scores_dict['accuracy'].append(cv_accuracy)
model_scores_dict['f1_score'].append(cv_f1_score)
model_scores_dict['recall'].append(cv_recall)
model_scores_dict['precision'].append(cv_precision)
model_scores_dict['time'].append(time.time() - start)
# Create DataFrame from model_scores_dict
model_score_df = pd.DataFrame(model_scores_dict).set_index("model_name")
# Sort DataFrame by accuracy, f1_score, and time
sorted_model_score_df = model_score_df.sort_values(by=["accuracy", "f1_score", "time"], ascending=False)
# Display sorted DataFrame
sorted_model_score_df
"""Let's see how we could improve the existing model `XGB` by finding suitable parameters via the process of hyperparameter tuning using `GridSearchCV`. We will check if the improved `XGB Classifier` model outperforms the `LogisticRegression` model."""
parameter = {
'max_depth':range(3,10,2),
'min_child_weight':range(1,5,2)
}
p_grid_search = GridSearchCV(estimator = xgb.XGBClassifier(eval_metric='mlogloss'),
param_grid = parameter,
scoring='accuracy',
n_jobs=-1,
#iid=False,
cv=2
)
p_grid_search.fit(X_train, y_train)
p_grid_search.best_params_, p_grid_search.best_score_
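# Note: the refit below does not use the grid-search optimum directly; it uses
# max_depth - 1 and min_child_weight + 4 relative to the values found above.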
refined_xgb_model = xgb.XGBClassifier(eval_metric='logloss',
max_depth=list(p_grid_search.best_params_.values())[0]-1,
min_child_weight=list(p_grid_search.best_params_.values())[-1]+4
).fit(X_train, y_train)
print('Accuracy of XGB classifier on training set: {:.2f}'.format(refined_xgb_model.score(X_train, y_train)))
print('Accuracy of XGB classifier on test set: {:.2f}'.format(refined_xgb_model.score(X_test[X_train.columns], y_test)))
"""As we can see, the improved `XGB classifier` model is accurate than the `LogisticRegression` model by $0.1$.
Let us go ahead and predict with these two models.
"""
ref_xgb_pred_y = refined_xgb_model.predict(X_test)
ref_xgb_pred_y
log_reg_pred_y = LogisticRegression().fit(X_train, y_train).predict(X_test)
log_reg_pred_y
"""Let compute the confusion matrices of these two models with the user-defined function `get_confusion_matrix` defined below."""
def get_confusion_matrix(y_test, y_pred):
"""
Displays the confusion matrix of the input numpy arrays y_test and y_pred.
INPUT:
y_test - A numpy array
y_pred - A numpy array
OUTPUT:
NoneType
"""
data = {'y_Actual': y_test, 'y_Predicted': y_pred}
df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])
conf_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'],
rownames=['Actual'],
colnames=['Predicted'])
sns.heatmap(conf_matrix, annot=True, fmt = "d", cmap="Spectral")
plt.show()
"""Let's get the confusion matrix for `y_test` and `ref_xgb_pred_y`, i.e., for the improved `XGB classifier` model and `y_test` and `log_reg_pred_y`, i.e., for the `LogisticRegression` model."""
get_confusion_matrix(np.array(y_test), ref_xgb_pred_y)
get_confusion_matrix(np.array(y_test), log_reg_pred_y)
"""Let's check if the refined `XGB Classifier` outperforms the `LogisticRegression` for the other metrics."""
# A dictionary of model names with the various metrics
ref_xgb_log_reg_dict = {"model_name" : ["xgb.XGBClassifier", "LogisticRegression"],
"accuracy" : [accuracy_score(y_test, ref_xgb_pred_y), accuracy_score(y_test, log_reg_pred_y)],
"f1_score" : [f1_score(y_test, ref_xgb_pred_y), f1_score(y_test, log_reg_pred_y)],
"recall" : [recall_score(y_test, ref_xgb_pred_y), recall_score(y_test, log_reg_pred_y)],
"precision" : [precision_score(y_test, ref_xgb_pred_y), precision_score(y_test, log_reg_pred_y)]
}
# Create a dataframe with ref_xgb_log_reg_dict
ref_xgb_log_reg_df = pd.DataFrame(ref_xgb_log_reg_dict).set_index("model_name")
# Order the dataframe ref_xgb_log_reg_df by the metric values in decreasing order
ref_xgb_log_reg_df.sort_values(by=["accuracy", "f1_score", "recall", "precision"], ascending=False)
"""It is obvious from the output in the cell above that for each metric, ${\rm accuracy}$, $F_{1}-{\rm score}$, ${\rm recall}$, and ${\rm precision}$, the improved or refined `XGB classifier` model out performs the `LogisticRegression` model. Thus, we will choose the refined `XGB classifier` model over the `LogisticRegression` model."""