Added Clustering Models and dataset. Updated the README

screwgoth · Jun 2, 2020 · 9640bb1 · 9640bb1
1 parent 80c6f6b
commit 9640bb1
Show file tree

Hide file tree

Showing 4 changed files with 620 additions and 0 deletions.
diff --git a/Clustering/Hierarchical Clustering/hierarchical_clustering.ipynb b/Clustering/Hierarchical Clustering/hierarchical_clustering.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "JKkbeQi2Mzug"
+   },
+   "source": [
+    "# Hierarchical Clustering"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "TaQI437hM1Ho"
+   },
+   "source": [
+    "## Importing the libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "2UW48DgcM4YS"
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "gFeTEtDxM7K4"
+   },
+   "source": [
+    "## Importing the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "4fS2J3HGM99q"
+   },
+   "outputs": [],
+   "source": [
+    "# Read the customer CSV and keep the columns at positions 3 and 4 as the feature matrix.\n",
+    "dataset = pd.read_csv('../../datasets/Mall_Customers.csv')\n",
+    "feature_columns = [3, 4]\n",
+    "X = dataset.iloc[:, feature_columns].values"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "czYMlG7cNBsu"
+   },
+   "source": [
+    "## Using the dendrogram to find the optimal number of clusters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 295
+    },
+    "colab_type": "code",
+    "executionInfo": {
+     "elapsed": 4948,
+     "status": "ok",
+     "timestamp": 1588363683148,
+     "user": {
+      "displayName": "Hadelin de Ponteves",
+      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64",
+      "userId": "15047218817161520419"
+     },
+     "user_tz": -240
+    },
+    "id": "RDQODpAFNILO",
+    "outputId": "8743058d-09a8-43f5-892d-6b1c140792a0"
+   },
+   "outputs": [],
+   "source": [
+    "import scipy.cluster.hierarchy as sch\n",
+    "# Build the linkage with method 'ward' first, then draw the dendrogram from it.\n",
+    "ward_linkage = sch.linkage(X, method = 'ward')\n",
+    "dendrogram = sch.dendrogram(ward_linkage)\n",
+    "plt.title('Dendrogram')\n",
+    "plt.xlabel('Customers')\n",
+    "plt.ylabel('Euclidean distances')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "KDbXbo9INLF6"
+   },
+   "source": [
+    "## Training the Hierarchical Clustering model on the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "IoH3zs2KNSw6"
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.cluster import AgglomerativeClustering\n",
+    "# Ward linkage only accepts Euclidean distance, which is also the default, so the distance\n",
+    "# argument is omitted. The old `affinity` keyword was deprecated in scikit-learn 1.2 and\n",
+    "# removed in 1.4; leaving it out keeps this cell working on both old and new releases.\n",
+    "hc = AgglomerativeClustering(n_clusters = 5, linkage = 'ward')\n",
+    "y_hc = hc.fit_predict(X)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "X-SYG7l9NVmU"
+   },
+   "source": [
+    "## Visualising the clusters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 295
+    },
+    "colab_type": "code",
+    "executionInfo": {
+     "elapsed": 1290,
+     "status": "ok",
+     "timestamp": 1588363703003,
+     "user": {
+      "displayName": "Hadelin de Ponteves",
+      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64",
+      "userId": "15047218817161520419"
+     },
+     "user_tz": -240
+    },
+    "id": "-91tDJrnNY2p",
+    "outputId": "fc9652fa-6e3f-4b68-c4ff-e6fd6b4bce7d"
+   },
+   "outputs": [],
+   "source": [
+    "# Draw one scatter series per cluster label (0-4); the list order fixes each cluster's colour.\n",
+    "cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']\n",
+    "for cluster_id, color in enumerate(cluster_colors):\n",
+    "    in_cluster = y_hc == cluster_id  # boolean mask selecting the rows assigned to this cluster\n",
+    "    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100, c = color, label = f'Cluster {cluster_id + 1}')\n",
+    "plt.title('Clusters of customers')\n",
+    "plt.xlabel('Annual Income (k$)')\n",
+    "plt.ylabel('Spending Score (1-100)')\n",
+    "plt.legend()\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "authorship_tag": "ABX9TyOE/Ghkv22sqrXHjexUJwPA",
+   "collapsed_sections": [],
+   "name": "hierarchical_clustering.ipynb",
+   "provenance": [],
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}