diff --git a/Clustering/Hierarchical Clustering/hierarchical_clustering.ipynb b/Clustering/Hierarchical Clustering/hierarchical_clustering.ipynb new file mode 100644 index 0000000..fd031f1 --- /dev/null +++ b/Clustering/Hierarchical Clustering/hierarchical_clustering.ipynb @@ -0,0 +1,206 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "JKkbeQi2Mzug" + }, + "source": [ + "# Hierarchical Clustering" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "TaQI437hM1Ho" + }, + "source": [ + "## Importing the libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "2UW48DgcM4YS" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gFeTEtDxM7K4" + }, + "source": [ + "## Importing the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4fS2J3HGM99q" + }, + "outputs": [], + "source": [ + "dataset = pd.read_csv('../../datasets/Mall_Customers.csv')\n", + "X = dataset.iloc[:, [3, 4]].values" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "czYMlG7cNBsu" + }, + "source": [ + "## Using the dendrogram to find the optimal number of clusters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 295 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 4948, + "status": "ok", + "timestamp": 1588363683148, + "user": { + "displayName": "Hadelin de Ponteves", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64", + "userId": "15047218817161520419" + }, + "user_tz": -240 + }, + "id": 
"RDQODpAFNILO", + "outputId": "8743058d-09a8-43f5-892d-6b1c140792a0" + }, + "outputs": [], + "source": [ + "import scipy.cluster.hierarchy as sch\n", + "dendrogram = sch.dendrogram(sch.linkage(X, method = 'ward'))\n", + "plt.title('Dendrogram')\n", + "plt.xlabel('Customers')\n", + "plt.ylabel('Euclidean distances')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "KDbXbo9INLF6" + }, + "source": [ + "## Training the Hierarchical Clustering model on the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "IoH3zs2KNSw6" + }, + "outputs": [], + "source": [ + "from sklearn.cluster import AgglomerativeClustering\n", + "hc = AgglomerativeClustering(n_clusters = 5, affinity = 'euclidean', linkage = 'ward')\n", + "y_hc = hc.fit_predict(X)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "X-SYG7l9NVmU" + }, + "source": [ + "## Visualising the clusters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 295 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1290, + "status": "ok", + "timestamp": 1588363703003, + "user": { + "displayName": "Hadelin de Ponteves", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64", + "userId": "15047218817161520419" + }, + "user_tz": -240 + }, + "id": "-91tDJrnNY2p", + "outputId": "fc9652fa-6e3f-4b68-c4ff-e6fd6b4bce7d" + }, + "outputs": [], + "source": [ + "plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s = 100, c = 'red', label = 'Cluster 1')\n", + "plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s = 100, c = 'blue', label = 'Cluster 2')\n", + "plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s = 100, c = 'green', label = 'Cluster 3')\n", + "plt.scatter(X[y_hc == 3, 0], X[y_hc == 3, 1], s = 100, c = 'cyan', label = 
'Cluster 4')\n", + "plt.scatter(X[y_hc == 4, 0], X[y_hc == 4, 1], s = 100, c = 'magenta', label = 'Cluster 5')\n", + "plt.title('Clusters of customers')\n", + "plt.xlabel('Annual Income (k$)')\n", + "plt.ylabel('Spending Score (1-100)')\n", + "plt.legend()\n", + "plt.show()" + ] + } + ], + "metadata": { + "colab": { + "authorship_tag": "ABX9TyOE/Ghkv22sqrXHjexUJwPA", + "collapsed_sections": [], + "name": "hierarchical_clustering.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/Clustering/K-Means Clustering/k_means_clustering.ipynb b/Clustering/K-Means Clustering/k_means_clustering.ipynb new file mode 100644 index 0000000..0ae0f1c --- /dev/null +++ b/Clustering/K-Means Clustering/k_means_clustering.ipynb @@ -0,0 +1,210 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "i_paGd_yLbgH" + }, + "source": [ + "# K-Means Clustering" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "nAuqPwTnLipr" + }, + "source": [ + "## Importing the libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "yUXGcC4KLmcL" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "5LciKOr8Lo5O" + }, + "source": [ + "## Importing the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": 
"9RlmPzZGLtGi" + }, + "outputs": [], + "source": [ + "dataset = pd.read_csv('../../datasets/Mall_Customers.csv')\n", + "X = dataset.iloc[:, [3, 4]].values" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "nWC2EWp2Lx5G" + }, + "source": [ + "## Using the elbow method to find the optimal number of clusters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 295 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1431, + "status": "ok", + "timestamp": 1588239707642, + "user": { + "displayName": "Hadelin de Ponteves", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64", + "userId": "15047218817161520419" + }, + "user_tz": -240 + }, + "id": "zWs6ciOoL1b3", + "outputId": "bd3740ef-650e-4ae3-da64-821d7df9c7d0" + }, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans\n", + "wcss = []\n", + "for i in range(1, 11):\n", + " kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)\n", + " kmeans.fit(X)\n", + " wcss.append(kmeans.inertia_)\n", + "plt.plot(range(1, 11), wcss)\n", + "plt.title('The Elbow Method')\n", + "plt.xlabel('Number of clusters')\n", + "plt.ylabel('WCSS')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "VgT0mANLL4Nz" + }, + "source": [ + "## Training the K-Means model on the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "cjEfU6ZSMAPl" + }, + "outputs": [], + "source": [ + "kmeans = KMeans(n_clusters = 5, init = 'k-means++', random_state = 42)\n", + "y_kmeans = kmeans.fit_predict(X)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "e7YrS1JAMFnm" + }, + "source": [ + "## Visualising the clusters" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 295 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1085, + "status": "ok", + "timestamp": 1588239716139, + "user": { + "displayName": "Hadelin de Ponteves", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GhEuXdT7eQweUmRPW8_laJuPggSK6hfvpl5a6WBaA=s64", + "userId": "15047218817161520419" + }, + "user_tz": -240 + }, + "id": "d0ZYecccMHNx", + "outputId": "2db95eb9-0c99-4718-f28c-1d3155258041" + }, + "outputs": [], + "source": [ + "plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 100, c = 'red', label = 'Cluster 1')\n", + "plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s = 100, c = 'blue', label = 'Cluster 2')\n", + "plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s = 100, c = 'green', label = 'Cluster 3')\n", + "plt.scatter(X[y_kmeans == 3, 0], X[y_kmeans == 3, 1], s = 100, c = 'cyan', label = 'Cluster 4')\n", + "plt.scatter(X[y_kmeans == 4, 0], X[y_kmeans == 4, 1], s = 100, c = 'magenta', label = 'Cluster 5')\n", + "plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroids')\n", + "plt.title('Clusters of customers')\n", + "plt.xlabel('Annual Income (k$)')\n", + "plt.ylabel('Spending Score (1-100)')\n", + "plt.legend()\n", + "plt.show()" + ] + } + ], + "metadata": { + "colab": { + "authorship_tag": "ABX9TyN979dFKn9B6Ro9v0hJ4uqU", + "name": "k_means_clustering.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/README.md b/README.md index d741f57..e94e0c7 
100644 --- a/README.md +++ b/README.md @@ -21,6 +21,9 @@ The repository is composed of the following sections: * Naive Bayes * Decision Tree Classification * Random Forest Classification +* `Clustering` : This folder contains Jupyter notebooks for the following Clustering Models: + * K-Means Clustering + * Hierarchical Clustering diff --git a/datasets/Mall_Customers.csv b/datasets/Mall_Customers.csv new file mode 100644 index 0000000..b324941 --- /dev/null +++ b/datasets/Mall_Customers.csv @@ -0,0 +1,201 @@ +CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100) +0001,Male,19,15,39 +0002,Male,21,15,81 +0003,Female,20,16,6 +0004,Female,23,16,77 +0005,Female,31,17,40 +0006,Female,22,17,76 +0007,Female,35,18,6 +0008,Female,23,18,94 +0009,Male,64,19,3 +0010,Female,30,19,72 +0011,Male,67,19,14 +0012,Female,35,19,99 +0013,Female,58,20,15 +0014,Female,24,20,77 +0015,Male,37,20,13 +0016,Male,22,20,79 +0017,Female,35,21,35 +0018,Male,20,21,66 +0019,Male,52,23,29 +0020,Female,35,23,98 +0021,Male,35,24,35 +0022,Male,25,24,73 +0023,Female,46,25,5 +0024,Male,31,25,73 +0025,Female,54,28,14 +0026,Male,29,28,82 +0027,Female,45,28,32 +0028,Male,35,28,61 +0029,Female,40,29,31 +0030,Female,23,29,87 +0031,Male,60,30,4 +0032,Female,21,30,73 +0033,Male,53,33,4 +0034,Male,18,33,92 +0035,Female,49,33,14 +0036,Female,21,33,81 +0037,Female,42,34,17 +0038,Female,30,34,73 +0039,Female,36,37,26 +0040,Female,20,37,75 +0041,Female,65,38,35 +0042,Male,24,38,92 +0043,Male,48,39,36 +0044,Female,31,39,61 +0045,Female,49,39,28 +0046,Female,24,39,65 +0047,Female,50,40,55 +0048,Female,27,40,47 +0049,Female,29,40,42 +0050,Female,31,40,42 +0051,Female,49,42,52 +0052,Male,33,42,60 +0053,Female,31,43,54 +0054,Male,59,43,60 +0055,Female,50,43,45 +0056,Male,47,43,41 +0057,Female,51,44,50 +0058,Male,69,44,46 +0059,Female,27,46,51 +0060,Male,53,46,46 +0061,Male,70,46,56 +0062,Male,19,46,55 +0063,Female,67,47,52 +0064,Female,54,47,59 +0065,Male,63,48,51 +0066,Male,18,48,59 +0067,Female,43,48,50
+0068,Female,68,48,48 +0069,Male,19,48,59 +0070,Female,32,48,47 +0071,Male,70,49,55 +0072,Female,47,49,42 +0073,Female,60,50,49 +0074,Female,60,50,56 +0075,Male,59,54,47 +0076,Male,26,54,54 +0077,Female,45,54,53 +0078,Male,40,54,48 +0079,Female,23,54,52 +0080,Female,49,54,42 +0081,Male,57,54,51 +0082,Male,38,54,55 +0083,Male,67,54,41 +0084,Female,46,54,44 +0085,Female,21,54,57 +0086,Male,48,54,46 +0087,Female,55,57,58 +0088,Female,22,57,55 +0089,Female,34,58,60 +0090,Female,50,58,46 +0091,Female,68,59,55 +0092,Male,18,59,41 +0093,Male,48,60,49 +0094,Female,40,60,40 +0095,Female,32,60,42 +0096,Male,24,60,52 +0097,Female,47,60,47 +0098,Female,27,60,50 +0099,Male,48,61,42 +0100,Male,20,61,49 +0101,Female,23,62,41 +0102,Female,49,62,48 +0103,Male,67,62,59 +0104,Male,26,62,55 +0105,Male,49,62,56 +0106,Female,21,62,42 +0107,Female,66,63,50 +0108,Male,54,63,46 +0109,Male,68,63,43 +0110,Male,66,63,48 +0111,Male,65,63,52 +0112,Female,19,63,54 +0113,Female,38,64,42 +0114,Male,19,64,46 +0115,Female,18,65,48 +0116,Female,19,65,50 +0117,Female,63,65,43 +0118,Female,49,65,59 +0119,Female,51,67,43 +0120,Female,50,67,57 +0121,Male,27,67,56 +0122,Female,38,67,40 +0123,Female,40,69,58 +0124,Male,39,69,91 +0125,Female,23,70,29 +0126,Female,31,70,77 +0127,Male,43,71,35 +0128,Male,40,71,95 +0129,Male,59,71,11 +0130,Male,38,71,75 +0131,Male,47,71,9 +0132,Male,39,71,75 +0133,Female,25,72,34 +0134,Female,31,72,71 +0135,Male,20,73,5 +0136,Female,29,73,88 +0137,Female,44,73,7 +0138,Male,32,73,73 +0139,Male,19,74,10 +0140,Female,35,74,72 +0141,Female,57,75,5 +0142,Male,32,75,93 +0143,Female,28,76,40 +0144,Female,32,76,87 +0145,Male,25,77,12 +0146,Male,28,77,97 +0147,Male,48,77,36 +0148,Female,32,77,74 +0149,Female,34,78,22 +0150,Male,34,78,90 +0151,Male,43,78,17 +0152,Male,39,78,88 +0153,Female,44,78,20 +0154,Female,38,78,76 +0155,Female,47,78,16 +0156,Female,27,78,89 +0157,Male,37,78,1 +0158,Female,30,78,78 +0159,Male,34,78,1 +0160,Female,30,78,73 +0161,Female,56,79,35 +0162,Female,29,79,83 
+0163,Male,19,81,5 +0164,Female,31,81,93 +0165,Male,50,85,26 +0166,Female,36,85,75 +0167,Male,42,86,20 +0168,Female,33,86,95 +0169,Female,36,87,27 +0170,Male,32,87,63 +0171,Male,40,87,13 +0172,Male,28,87,75 +0173,Male,36,87,10 +0174,Male,36,87,92 +0175,Female,52,88,13 +0176,Female,30,88,86 +0177,Male,58,88,15 +0178,Male,27,88,69 +0179,Male,59,93,14 +0180,Male,35,93,90 +0181,Female,37,97,32 +0182,Female,32,97,86 +0183,Male,46,98,15 +0184,Female,29,98,88 +0185,Female,41,99,39 +0186,Male,30,99,97 +0187,Female,54,101,24 +0188,Male,28,101,68 +0189,Female,41,103,17 +0190,Female,36,103,85 +0191,Female,34,103,23 +0192,Female,32,103,69 +0193,Male,33,113,8 +0194,Female,38,113,91 +0195,Female,47,120,16 +0196,Female,35,120,79 +0197,Female,45,126,28 +0198,Male,32,126,74 +0199,Male,32,137,18 +0200,Male,30,137,83 \ No newline at end of file