diff --git a/CreditCard_Customer_Segmentation.html b/CreditCard_Customer_Segmentation.html index c0364d3..2960ea9 100644 --- a/CreditCard_Customer_Segmentation.html +++ b/CreditCard_Customer_Segmentation.html @@ -3,7 +3,7 @@
-AllLife Bank (ALB) wants to focus on its credit card customer base in the next financial year. ALB has been advised by its marketing research team (MRT), that the market penetration can be improved. Based on this premise, the Marketing team proposes to run personalized campaigns to target new customers as well as upsell to existing customers. Another insight from the MRT was that customers perceive the support services of ALB poorly. Considering these, the Operations team wants to improve the service delivery model to ensure that customer queries are resolved faster. The head of Marketing and the head of Delivery decide to reach out to the Data Science team for help.
--AI Generated image.
+A financial institution (hereafter "FI Bank") aims to prioritize its credit card customer base in the upcoming financial year. FI Bank's marketing research team (hereafter "MRT") has identified room to improve market penetration. Based on this finding, the Marketing team proposes personalized campaigns to attract new customers and to upsell to existing ones. MRT has also found that customers perceive FI Bank's support services poorly, so the Operations team wants to improve the service delivery model and resolve customer queries faster. Recognizing these challenges, the heads of Marketing and Operations have reached out to the Data Science team for assistance.
-The data provided is of various customers of ALB and their financial attributes like credit limit, the total number of credit cards the customer has, and different channels through which customers have contacted the bank for any queries (including visiting the bank, online and through a call center).
+The data provided covers various FI Bank customers and their financial attributes, such as the credit limit, the total number of credit cards each customer holds, and the different channels through which customers have contacted the bank with queries (branch visits, online, and the call center).
Data Dictionary
#yellowbrick is a machine learning visualization library. We will use it to visualize silhouette plots for the KMeans clustering models.
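The install command itself is not visible in this diff; given the pip output below, the cell was presumably just the following (a hedged reconstruction, not copied from the source):
!pip install yellowbrick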
@@ -14827,22 +14831,40 @@ Importing Libraries
Requirement already satisfied: yellowbrick in /usr/local/lib/python3.10/dist-packages (1.5)
Requirement already satisfied: matplotlib!=3.0.0,>=2.0.2 in /usr/local/lib/python3.10/dist-packages (from yellowbrick) (3.7.1)
-Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from yellowbrick) (1.10.1)
+Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from yellowbrick) (1.11.4)
Requirement already satisfied: scikit-learn>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from yellowbrick) (1.2.2)
-Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.10/dist-packages (from yellowbrick) (1.23.5)
-Requirement already satisfied: cycler>=0.10.0 in /usr/local/lib/python3.10/dist-packages (from yellowbrick) (0.11.0)
-Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.1.0)
-Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (4.42.1)
-Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.4.4)
-Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (23.1)
+Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.10/dist-packages (from yellowbrick) (1.25.2)
+Requirement already satisfied: cycler>=0.10.0 in /usr/local/lib/python3.10/dist-packages (from yellowbrick) (0.12.1)
+Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.2.0)
+Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (4.49.0)
+Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.4.5)
+Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (24.0)
Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (9.4.0)
-Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (3.1.1)
+Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (2.8.2)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0.0->yellowbrick) (1.3.2)
-Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0.0->yellowbrick) (3.2.0)
+Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0.0->yellowbrick) (3.3.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.16.0)
# Avoids scroll-in-the-scroll in the entire Notebook
@@ -14872,12 +14894,41 @@ Importing Libraries
#import libraries
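The body of the import cell did not survive the HTML-to-diff conversion. A minimal sketch of the imports this notebook appears to rely on, inferred from the calls used later (not the author's exact cell):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cdist          # used for the elbow-curve distortions
from scipy.stats import zscore                    # used as an alternative to StandardScaler
from sklearn.preprocessing import StandardScaler  # feature scaling before clustering
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer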
@@ -14954,9 +15005,27 @@ Importing Libraries
#allow access to google drive
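The mount call was stripped from this cell; assuming the notebook runs in Google Colab (the /content/gdrive path used below suggests so), it presumably looked like:
from google.colab import drive
drive.mount('/content/gdrive')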
@@ -15001,9 +15070,27 @@ Importing Libraries
#read csv file and load into a panda dataframe
-ALB_orig_df = pd.read_excel('/content/gdrive/MyDrive/Projects/ML/Data/Credit+Card+Customer+Data.xlsx')
+FI_orig_df = pd.read_excel('/content/gdrive/MyDrive/Notebooks/Credit_Card_Segmentation/Data/Credit+Card+Customer+Data.xlsx')
#copy original data frame
-ALB_df = ALB_orig_df.copy()
+FI_df = FI_orig_df.copy()
#Useful functions for this notebook
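The helper-function bodies did not survive the export. As an illustration only (not the author's implementation), sketches that match the call signatures used later in the notebook could look like:
def histogram_boxplot(data, feature, figsize=(12, 7)):
    # boxplot on top, histogram below, sharing the x-axis, with the median marked
    fig, (ax_box, ax_hist) = plt.subplots(
        nrows=2, sharex=True, gridspec_kw={"height_ratios": (0.25, 0.75)}, figsize=figsize
    )
    sns.boxplot(data=data, x=feature, ax=ax_box)
    sns.histplot(data=data, x=feature, ax=ax_hist)
    ax_hist.axvline(data[feature].median(), color="black", linestyle="--")
    plt.show()

def labeled_barplot(data, feature, perc=False, n=None):
    # bar plot of the n most frequent values, annotated with counts or percentages
    order = data[feature].value_counts().index[:n]
    ax = sns.countplot(data=data, x=feature, order=order)
    total = len(data[feature])
    for p in ax.patches:
        label = f"{100 * p.get_height() / total:.1f}%" if perc else f"{int(p.get_height())}"
        ax.annotate(label, (p.get_x() + p.get_width() / 2, p.get_height()), ha="center", va="bottom")
    plt.show()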
@@ -15307,7 +15430,7 @@ Functions
#redo hca
hca_pca = AgglomerativeClustering(n_clusters=number_clusters, affinity="euclidean", linkage="average")
- hca_pca.fit(ALB_scaled_df)
+ hca_pca.fit(FI_scaled_df)
#predict labels
label = hca_pca.fit_predict(pca_df)
@@ -15376,9 +15499,27 @@ Functions
#Check number of rows and columns
-ALB_df.shape
+FI_df.shape
#Lets take a look at the dataset characteristics
-ALB_df.info()
+FI_df.info()
-#lets examine the head
-ALB_df.head(10)
+#let's examine the head of the dataset
+FI_df.head(20)
#Checking missing values
-ALB_df.isna().sum()
+FI_df.isna().sum()
#Duplicated entries (two or more rows that are exactly identical)
-ALB_df.duplicated().sum()
+FI_df.duplicated().sum()
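If the duplicate check above returned a nonzero count, one way to handle it would be dropping the fully identical rows; this is illustrative only and not necessarily what the notebook does next:
FI_df = FI_df.drop_duplicates(keep="first")  # hypothetical clean-up step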
#Duplicated Customers (Customer Key)?
-ALB_df['Customer Key'].duplicated().sum()
+FI_df['Customer Key'].duplicated().sum()
-ALB_dupKey = ALB_df[ALB_df.duplicated(["Customer Key"], False)]
-ALB_dupKey.sort_values(by=['Customer Key'])
+FI_dupKey = FI_df[FI_df.duplicated(["Customer Key"], False)]
+FI_dupKey.sort_values(by=['Customer Key'])
#Drop first two columns
coltodrop = ['Sl_No','Customer Key']
-ALB_df.drop(coltodrop, axis=1,inplace=True)
+FI_df.drop(coltodrop, axis=1,inplace=True)
#verify that the columns were dropped
-ALB_df.info()
+FI_df.info()
#Check "Avg_Credit_Limit"
-ALB_df['Avg_Credit_Limit'].value_counts()
+FI_df['Avg_Credit_Limit'].value_counts()
#Check if there is a negative entry for "Avg_Credit_Limit"
-AvgCreditNeg = ALB_df[ALB_df['Avg_Credit_Limit']<0]
+AvgCreditNeg = FI_df[FI_df['Avg_Credit_Limit']<0]
AvgCreditNeg
#Check "Total_Credit_Cards"
-ALB_df['Total_Credit_Cards'].value_counts()
+FI_df['Total_Credit_Cards'].value_counts()
#Check "Total_visits_bank"
-ALB_df['Total_visits_bank'].value_counts()
+FI_df['Total_visits_bank'].value_counts()
#Check "Total_visits_online"
-ALB_df['Total_visits_online'].value_counts()
+FI_df['Total_visits_online'].value_counts()
#Check "Total_calls_made"
-ALB_df['Total_calls_made'].value_counts()
+FI_df['Total_calls_made'].value_counts()
#General statistics of the data
-ALB_df.describe().T
+FI_df.describe().T
#Analyze "Avg_Credit_Limit"
-histogram_boxplot(ALB_df, "Avg_Credit_Limit")
+histogram_boxplot(FI_df, "Avg_Credit_Limit")
-acl_median = ALB_df['Avg_Credit_Limit'].median()
+acl_median = FI_df['Avg_Credit_Limit'].median()
print("Avg_Credit_Limit median = ",acl_median)
@@ -17723,9 +18533,27 @@ Univariate Analysis
#Analyze "Total_Credit_Cards"
-histogram_boxplot(ALB_df, "Total_Credit_Cards")
+histogram_boxplot(FI_df, "Total_Credit_Cards")
-tcc_median = ALB_df['Total_Credit_Cards'].median()
+tcc_median = FI_df['Total_Credit_Cards'].median()
print("Total_Credit_Cards median = ",tcc_median)
@@ -17859,9 +18705,27 @@ Univariate Analysis
#Analyze "Total_visits_bank"
-histogram_boxplot(ALB_df, "Total_visits_bank")
+histogram_boxplot(FI_df, "Total_visits_bank")
-tvb_median = ALB_df['Total_visits_bank'].median()
+tvb_median = FI_df['Total_visits_bank'].median()
print("Total_visits_bank median = ",tvb_median)
@@ -17995,9 +18877,27 @@ Univariate Analysis
#Analyze "Total_visits_online"
-histogram_boxplot(ALB_df, "Total_visits_online")
+histogram_boxplot(FI_df, "Total_visits_online")
-tvo_median = ALB_df['Total_visits_online'].median()
+tvo_median = FI_df['Total_visits_online'].median()
print("Total_visits_online median = ",tvo_median)
@@ -18131,9 +19049,27 @@ Univariate Analysis
#Analyze "Total_calls_made"
-histogram_boxplot(ALB_df, "Total_calls_made")
+histogram_boxplot(FI_df, "Total_calls_made")
-tcm_median = ALB_df['Total_calls_made'].median()
+tcm_median = FI_df['Total_calls_made'].median()
print("Total_calls_made median = ",tcm_median)
@@ -18267,9 +19221,27 @@ Univariate Analysis
#make subplots of the general distribution of each feature
@@ -18322,10 +19294,10 @@ Univariate Analysis
counter = 0
for ii in range(3):
- sns.ecdfplot(ax=axes[ii][0], x=ALB_df[num_col[counter]])
+ sns.ecdfplot(ax=axes[ii][0], x=FI_df[num_col[counter]])
counter = counter + 1
if counter != 5:
- sns.ecdfplot(ax=axes[ii][1], x=ALB_df[num_col[counter]])
+ sns.ecdfplot(ax=axes[ii][1], x=FI_df[num_col[counter]])
counter = counter + 1
else:
pass
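The figure and axes setup for this ECDF grid was stripped in the export. A self-contained sketch of what the full cell could look like, assuming a 3x2 grid and the five numeric features used throughout the notebook as num_col:
num_col = ["Avg_Credit_Limit", "Total_Credit_Cards", "Total_visits_bank",
           "Total_visits_online", "Total_calls_made"]  # assumed feature list
fig, axes = plt.subplots(3, 2, figsize=(16, 12))
counter = 0
for ii in range(3):
    sns.ecdfplot(ax=axes[ii][0], x=FI_df[num_col[counter]])
    counter = counter + 1
    if counter != 5:
        sns.ecdfplot(ax=axes[ii][1], x=FI_df[num_col[counter]])
        counter = counter + 1
fig.delaxes(axes[2][1])  # only five features, so remove the unused sixth panel
plt.tight_layout()
plt.show()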
@@ -18354,9 +19326,27 @@ Univariate Analysis
# let's explore Total_visits_online further
-labeled_barplot(ALB_df, "Total_visits_online", perc=True)
+labeled_barplot(FI_df, "Total_visits_online", perc=True)
# let's explore Total_visits_bank further
-labeled_barplot(ALB_df, "Total_visits_bank", perc=True)
+labeled_barplot(FI_df, "Total_visits_bank", perc=True)
# let's explore Total_calls_made further
-labeled_barplot(ALB_df, "Total_calls_made", perc=True)
+labeled_barplot(FI_df, "Total_calls_made", perc=True)
# let's explore Total_Credit_Cards further
-labeled_barplot(ALB_df, "Total_Credit_Cards", perc=True)
+labeled_barplot(FI_df, "Total_Credit_Cards", perc=True)
plt.figure(figsize=(16, 8))
-sns.heatmap(ALB_df[num_col].corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
+sns.heatmap(FI_df[num_col].corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
-Number of Credit Cards vs Contact with the ALB
+Number of Credit Cards vs Customer Contact (Phone and Online) with FI Bank
-Type of Contacts vs Other Type of Contacts with ALB
+Type of Contacts vs Other Type of Contacts (Branch visit) with FI Bank
#visualize pair plots
-sns.pairplot(data=ALB_df[num_col], diag_kind="kde")
+sns.pairplot(data=FI_df[num_col], diag_kind="kde")
plt.show()
#variables used for clustering
@@ -19002,9 +20100,27 @@ Data Arrangement for Classi
@@ -19014,7 +20130,7 @@ Data Arrangement for Classi
- Out[ ]:
+ Out[47]:
@@ -19050,19 +20166,19 @@ Data Arrangement for Classi
-In [ ]:
+In [48]:
# scaling the dataset before clustering
scaler = StandardScaler()
#make a copy of the previously processed data. Keep in mind:
-#ALB_orig_df is the original data
-#ALB_df is the processed data treated against duplicates, null values, missing values,
+#FI_orig_df is the original data
+#FI_df is the processed data treated against duplicates, null values, missing values,
#dropped features before scaling
-#ALB_scaled is the scaled dataset
+#FI_scaled is the scaled dataset
-ALB_scaled = scaler.fit_transform(ALB_df)
+FI_scaled = scaler.fit_transform(FI_df)
@@ -19086,9 +20202,27 @@ Data Arrangement for Classi
@@ -19104,11 +20238,11 @@ Data Arrangement for Classi
-In [ ]:
+In [49]:
# creating a dataframe of the scaled columns
-ALB_scaled_df = pd.DataFrame(ALB_scaled, columns=ALB_df.columns)
+FI_scaled_df = pd.DataFrame(FI_scaled, columns=FI_df.columns)
@@ -19132,9 +20266,27 @@ Data Arrangement for Classi
@@ -19150,11 +20302,11 @@ Data Arrangement for Classi
-In [ ]:
+In [50]:
#The scaled data
-ALB_scaled_df.head(10)
+FI_scaled_df.head(10)
@@ -19178,9 +20330,27 @@ Data Arrangement for Classi
@@ -19190,13 +20360,13 @@ Data Arrangement for Classi
- Out[ ]:
+ Out[50]:
@@ -19472,12 +20711,12 @@ Data Arrangement for Classi
-In [ ]:
+In [51]:
+#Let's apply(zscore) to check the difference with StandardScaler()
-ALB_scaled_z_df =ALB_df.apply(zscore)
-ALB_scaled_z_df.head(10)
+FI_scaled_z_df =FI_df.apply(zscore)
+FI_scaled_z_df.head(10)
@@ -19501,9 +20740,27 @@ Data Arrangement for Classi
@@ -19513,13 +20770,13 @@ Data Arrangement for Classi
- Out[ ]:
+ Out[51]:
@@ -19797,10 +21123,10 @@ Data Arrangement for Classi
-In [ ]:
+In [52]:
-ALB_scaled_df.describe().T
+FI_scaled_df.describe().T
@@ -19824,9 +21150,27 @@ Data Arrangement for Classi
@@ -19836,13 +21180,13 @@ Data Arrangement for Classi
- Out[ ]:
+ Out[52]:
-In [ ]:
+In [53]:
clusters = range(1, 10)
@@ -20150,13 +21563,13 @@ Elbow curve method
for k in clusters:
model = KMeans(n_clusters=k)
- model.fit(ALB_scaled_df)
- prediction = model.predict(ALB_scaled_df)
+ model.fit(FI_scaled_df)
+ prediction = model.predict(FI_scaled_df)
distortion = (
sum(
- np.min(cdist(ALB_scaled_df, model.cluster_centers_, "euclidean"), axis=1)
+ np.min(cdist(FI_scaled_df, model.cluster_centers_, "euclidean"), axis=1)
)
- / ALB_scaled_df.shape[0]
+ / FI_scaled_df.shape[0]
)
meanDistortions.append(distortion)
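The plotting half of the elbow-curve cell is not visible here. Assuming meanDistortions was initialized to an empty list before the loop, the usual follow-up would be a sketch like:
plt.plot(clusters, meanDistortions, "bx-")
plt.xlabel("k")
plt.ylabel("Average distortion")
plt.title("Selecting k with the elbow method")
plt.show()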
@@ -20191,9 +21604,27 @@ Elbow curve method
@@ -20211,11 +21642,11 @@ Elbow curve method
Elbow curve method
-Silhouette scores
-In [ ]:
+In [54]:
sil_score = []
cluster_list = list(range(2, 10))
for n_clusters in cluster_list:
clusterer = KMeans(n_clusters=n_clusters)
- preds = clusterer.fit_predict((ALB_scaled_df))
+ preds = clusterer.fit_predict((FI_scaled_df))
# centers = clusterer.cluster_centers_
- score = silhouette_score(ALB_scaled_df, preds)
+ score = silhouette_score(FI_scaled_df, preds)
sil_score.append(score)
print("For n_clusters = {}, the silhouette score is {})".format(n_clusters, score))
@@ -20324,9 +21755,27 @@ Silhouette scores
@@ -20343,11 +21792,11 @@ Silhouette scores
Silhouette scores
-Silhouette scores
-In [ ]:
+In [55]:
# finding optimal no. of clusters with silhouette coefficients k=3
visualizer = SilhouetteVisualizer(KMeans(3, random_state=1))
-visualizer.fit(ALB_scaled_df)
+visualizer.fit(FI_scaled_df)
visualizer.show()
@@ -20435,9 +21884,27 @@ Silhouette scores
@@ -20465,7 +21932,7 @@ Silhouette scores
- Out[ ]:
+ Out[55]:
@@ -20499,12 +21966,12 @@ Silhouette scores
-In [ ]:
+In [56]:
# finding optimal no. of clusters with silhouette coefficients k=4
visualizer = SilhouetteVisualizer(KMeans(4, random_state=1))
-visualizer.fit(ALB_scaled_df)
+visualizer.fit(FI_scaled_df)
visualizer.show()
@@ -20529,9 +21996,27 @@ Silhouette scores
@@ -20559,7 +22044,7 @@ Silhouette scores
- Out[ ]:
+ Out[56]:
@@ -20593,12 +22078,12 @@ Silhouette scores
-In [ ]:
+In [57]:
# finding optimal no. of clusters with silhouette coefficients k=2
visualizer = SilhouetteVisualizer(KMeans(2, random_state=1))
-visualizer.fit(ALB_scaled_df)
+visualizer.fit(FI_scaled_df)
visualizer.show()
@@ -20623,9 +22108,27 @@ Silhouette scores
@@ -20653,7 +22156,7 @@ Silhouette scores
- Out[ ]:
+ Out[57]:
@@ -20699,19 +22202,19 @@ Silhouette scores
-In [ ]:
+In [58]:
#since we are going to compare our results for different ks, let's make different
#dataframes for each k
-ALB_k3_df = ALB_df.copy()
-ALB_scaled_k3_df = ALB_scaled_df.copy()
+FI_k3_df = FI_df.copy()
+FI_scaled_k3_df = FI_scaled_df.copy()
-ALB_k4_df = ALB_df.copy()
-ALB_scaled_k4_df = ALB_scaled_df.copy()
+FI_k4_df = FI_df.copy()
+FI_scaled_k4_df = FI_scaled_df.copy()
-ALB_k2_df = ALB_df.copy()
-ALB_scaled_k2_df = ALB_scaled_df.copy()
+FI_k2_df = FI_df.copy()
+FI_scaled_k2_df = FI_scaled_df.copy()
@@ -20735,9 +22238,27 @@ Silhouette scores
-In [ ]:
+In [59]:
# let's take 3 as number of clusters
kmeans_k3 = KMeans(n_clusters=3, random_state=1)
-kmeans_k3.fit(ALB_scaled_k3_df)
-prediction_k3 = kmeans_k3.predict(ALB_scaled_k3_df)
+kmeans_k3.fit(FI_scaled_k3_df)
+prediction_k3 = kmeans_k3.predict(FI_scaled_k3_df)
@@ -20818,9 +22339,27 @@ Cluster profiling
-In [ ]:
+In [60]:
# adding kmeans cluster labels to the original and scaled dataframes
-ALB_k3_df["SEGMENT"] = prediction_k3
-ALB_scaled_k3_df["SEGMENT"] = prediction_k3
+FI_k3_df["SEGMENT"] = prediction_k3
+FI_scaled_k3_df["SEGMENT"] = prediction_k3
print("SEGMENTS Assigned : \n")
-ALB_k3_df.head(20)
+FI_k3_df.head(20)
@@ -20867,9 +22406,27 @@ Cluster profiling
@@ -20892,13 +22449,13 @@ Cluster profiling
- Out[ ]:
+ Out[60]:
-In [ ]:
+In [61]:
#Averages for each feature when k=3
-cluster_profile_k3 = ALB_k3_df.groupby("SEGMENT").mean()
+cluster_profile_k3 = FI_k3_df.groupby("SEGMENT").mean()
@@ -21291,9 +22917,27 @@ Cluster profiling
-In [ ]:
+In [62]:
#plot counts for each segment
-labeled_barplot(ALB_scaled_k3_df, "SEGMENT", perc=True, n=None)
+labeled_barplot(FI_scaled_k3_df, "SEGMENT", perc=True, n=None)
@@ -21337,9 +22981,27 @@ Cluster profiling
-In [ ]:
+In [63]:
cluster_profile_k3["count_in_each_segments"] = (
- ALB_k3_df.groupby("SEGMENT")["Avg_Credit_Limit"].count().values
+ FI_k3_df.groupby("SEGMENT")["Avg_Credit_Limit"].count().values
)
@@ -21414,9 +23076,27 @@ Cluster profiling
-In [ ]:
+In [64]:
# let's display cluster profiles
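The display code was stripped from this cell. A hedged sketch, assuming the common pandas Styler pattern of highlighting, per feature, the segment with the largest mean:
cluster_profile_k3.style.highlight_max(color="lightgreen", axis=0)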
@@ -21460,9 +23140,27 @@ Cluster profiling
@@ -21472,26 +23170,26 @@ Cluster profiling
- Out[ ]:
+ Out[64]: