Commit 5b238a2 (parent bc43599): 1 changed file with 167 additions and 0 deletions.
@@ -0,0 +1,167 @@
# The implementation below covers the main steps of the customer segmentation analysis:
# 1. Data loading and preprocessing
# 2. Exploratory Data Analysis (EDA)
# 3. Feature engineering and normalization
# 4. Principal Component Analysis (PCA)
# 5. K-means clustering
# 6. UMAP visualization
# 7. Cluster analysis
# 8. Customer persona creation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import umap.umap_ as umap
import os

# Set random seed for reproducibility
np.random.seed(42)

# Load the data
customer_df = pd.read_csv("/Users/supriyakushwaha/Desktop/Projects/Customer_Segmentation_Banking/Marketing_data.csv")

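# (Optional sketch, not part of the original commit) Quick look at the raw data
# before any cleaning is applied.
print(customer_df.shape)
print(customer_df.head())
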
# Clean column names
customer_df.columns = customer_df.columns.str.lower().str.replace(' ', '_')

# Handle missing values
customer_df['minimum_payments'] = customer_df['minimum_payments'].fillna(customer_df['minimum_payments'].median())
customer_df['credit_limit'] = customer_df['credit_limit'].fillna(customer_df['credit_limit'].median())

# Create credit utilization feature
customer_df['credit_utilization'] = customer_df['balance'] / customer_df['credit_limit']

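# (Optional sketch, not part of the original commit) A zero credit limit would make
# the ratio above infinite, which would break the scaling and PCA steps further
# down; this check flags that case early.
n_nonfinite = (~np.isfinite(customer_df['credit_utilization'])).sum()
print(f"Non-finite credit_utilization values: {n_nonfinite}")
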
# Filter for customers with tenure of 10 months
df_filtered = customer_df[customer_df['tenure'] == 10].copy()

# Drop unnecessary columns
columns_to_drop = ['cust_id', 'installments_purchases']
df_filtered = df_filtered.drop(columns=columns_to_drop)

# Exploratory Data Analysis functions
def plot_histograms(df):
    n_cols = len(df.columns)
    n_rows = (n_cols + 3) // 4  # Ceiling division: four plots per row
    fig, axes = plt.subplots(n_rows, 4, figsize=(20, 5 * n_rows))
    axes = axes.flatten()  # Flatten the 2D array of axes

    for i, col in enumerate(df.columns):
        if i < len(axes):
            sns.histplot(df[col], ax=axes[i], kde=True)
            axes[i].set_title(col)

    # Hide any unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()

def plot_credit_utilization(df):
    plt.figure(figsize=(12, 6))
    sns.histplot(df[df['credit_utilization'] <= 1]['credit_utilization'], kde=True)
    plt.title('Credit Utilization Distribution (<=100%)')
    plt.show()

    plt.figure(figsize=(12, 6))
    sns.histplot(df[df['credit_utilization'] > 1]['credit_utilization'], kde=True)
    plt.title('Credit Utilization Distribution (>100%)')
    plt.show()

def plot_cash_advance_cohorts(df):
    df['cash_advance_cohort'] = pd.cut(df['cash_advance'],
                                       bins=[-np.inf, 1000, 5000, np.inf],
                                       labels=['Low', 'Medium', 'High'])

    plt.figure(figsize=(12, 6))
    sns.barplot(x='cash_advance_cohort', y='cash_advance', data=df[df['purchases'] == 0])
    plt.title('Average Cash Advance by Cohort (Customers with Zero Purchases)')
    plt.show()

# Call the EDA functions
plot_histograms(df_filtered)
plot_credit_utilization(df_filtered)
plot_cash_advance_cohorts(df_filtered)

# Feature Engineering and Normalization
features = df_filtered.columns.drop(['tenure', 'cash_advance_cohort'])
scaler = StandardScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df_filtered[features]), columns=features)

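# (Optional sketch, not part of the original commit) After standardization every
# feature should have mean ~0 and standard deviation ~1; a quick sanity check.
print(df_normalized.describe().loc[['mean', 'std']].round(2))
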
# Apply PCA, keeping enough components to explain 80% of the variance
pca = PCA(n_components=0.8, random_state=42)
df_pca = pd.DataFrame(pca.fit_transform(df_normalized))

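# (Optional sketch, not part of the original commit) With a float n_components,
# scikit-learn keeps the smallest number of components whose cumulative explained
# variance reaches that fraction; printing the ratios shows what was retained.
print(f"Components retained: {pca.n_components_}")
print(f"Cumulative explained variance: {pca.explained_variance_ratio_.cumsum().round(3)}")
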
# Determine optimal number of clusters
def plot_elbow_curve(data):
    wcss = []
    for i in range(1, 11):
        kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
        kmeans.fit(data)
        wcss.append(kmeans.inertia_)

    plt.figure(figsize=(10, 6))
    plt.plot(range(1, 11), wcss, marker='o')
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show()

plot_elbow_curve(df_pca)

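# (Optional sketch, not part of the original commit) Silhouette scores complement
# the elbow curve; higher values indicate better-separated, more cohesive clusters.
from sklearn.metrics import silhouette_score

for k in range(2, 11):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(df_pca)
    print(f"k={k}: silhouette = {silhouette_score(df_pca, labels):.3f}")
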
# Apply K-Means clustering
kmeans = KMeans(n_clusters=4, random_state=42)
df_filtered['cluster'] = kmeans.fit_predict(df_pca)

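# (Optional sketch, not part of the original commit) Check how many customers fall
# into each cluster before profiling them.
print(df_filtered['cluster'].value_counts().sort_index())
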
# Visualize clusters using UMAP
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(df_pca)

plt.figure(figsize=(12, 8))
scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=df_filtered['cluster'], cmap='viridis')
plt.colorbar(scatter)
plt.title('UMAP projection of the clusters')
plt.show()

# Analyze clusters
def plot_feature_distribution(df, features):
    fig, axes = plt.subplots(2, 2, figsize=(20, 20))
    axes = axes.ravel()

    for i, feature in enumerate(features):
        sns.boxplot(x='cluster', y=feature, data=df, ax=axes[i])
        axes[i].set_title(f'Distribution of {feature} by Cluster')

    plt.tight_layout()
    plt.show()

features_to_analyze = ['balance', 'cash_advance', 'credit_utilization', 'purchases']
plot_feature_distribution(df_filtered, features_to_analyze)

features_to_analyze = ['credit_limit', 'minimum_payments', 'payments', 'purchases_trx']
plot_feature_distribution(df_filtered, features_to_analyze)

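# (Optional sketch, not part of the original commit) A numeric companion to the
# boxplots: per-cluster medians of the profiled features ('profile_features' is an
# illustrative name, not from the original script).
profile_features = ['balance', 'cash_advance', 'credit_utilization', 'purchases',
                    'credit_limit', 'minimum_payments', 'payments', 'purchases_trx']
print(df_filtered.groupby('cluster')[profile_features].median().round(2))
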
# Create customer personas
def create_persona(cluster_data):
    persona = {
        'credit_utilization': f"{cluster_data['credit_utilization'].median():.2f} - {cluster_data['credit_utilization'].max():.2f}",
        'cash_advance': f"${cluster_data['cash_advance'].median():.0f} - ${cluster_data['cash_advance'].max():.0f}",
        'purchases': f"${cluster_data['purchases'].median():.0f} - ${cluster_data['purchases'].max():.0f}",
        'balance': f"${cluster_data['balance'].median():.0f} - ${cluster_data['balance'].max():.0f}"
    }
    return persona

personas = {}
for cluster in df_filtered['cluster'].unique():
    cluster_data = df_filtered[df_filtered['cluster'] == cluster]
    personas[f"Cluster {cluster}"] = create_persona(cluster_data)

for cluster, persona in personas.items():
    print(f"{cluster}:")
    for key, value in persona.items():
        print(f"  {key}: {value}")
    print()
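
# (Optional sketch, not part of the original commit) The same personas as a table,
# which is easier to export or share than the printed dictionaries.
personas_df = pd.DataFrame(personas).T
print(personas_df)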