diff --git a/DNA CLASSIFICATION/DNA_Classification_Code.ipynb b/DNA CLASSIFICATION/DNA_Classification_Code.ipynb new file mode 100644 index 00000000..0d49afca --- /dev/null +++ b/DNA CLASSIFICATION/DNA_Classification_Code.ipynb @@ -0,0 +1,3293 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "QkgQB_-a6PfP" + }, + "source": [ + "# DNA Classification Using Machine Learning" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5cJqFevm6PfW" + }, + "source": [ + "## About :\n", + "In this project, we will explore the world of bioinformatics by using Markov models, K-nearest neighbor (KNN) algorithms, support vector machines, and other common classifiers to classify short E. coli DNA sequences. This project will use a dataset from the UCI Machine Learning Repository that has 106 DNA sequences, with 57 sequential nucleotides (“base-pairs”) each.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "It includes :\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "N4d5V74G6Pfa" + }, + "outputs": [], + "source": [ + "# Hide warnings\n", + "import warnings\n", + "warnings.simplefilter('ignore')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UoHvr8t66Pfe" + }, + "source": [ + "## Step 1: Importing the Dataset\n", + "\n", + "The following code cells will import necessary libraries and import the dataset from the UCI repository as a Pandas DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "njKpY80E6Pfg" + }, + "outputs": [], + "source": [ + "#import and change module name\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'\n", + "names = ['Class', 'id', 'Sequence']\n", + "data = pd.read_csv(url, names = names)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + 
"metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SZlIvNHq6Pfh", + "outputId": "c3ad494b-07cb-4d4b-b7cb-061d331b6109" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index(['Class', 'id', 'Sequence'], dtype='object')" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "v3Ffs5eT6Pfk", + "outputId": "6a6e13da-3e36-4876-e7fb-d796c925f1ac" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Class id Sequence\n", + "0 + S10 \\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...\n", + "1 + AMPC \\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...\n", + "2 + AROH \\t\\tgtactagagaactagtgcattagcttatttttttgttatcat...\n", + "3 + DEOP2 \\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...\n", + "4 + LEU1_TRNA \\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc..." + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ClassidSequence
0+S10\\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1+AMPC\\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2+AROH\\t\\tgtactagagaactagtgcattagcttatttttttgttatcat...
3+DEOP2\\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4+LEU1_TRNA\\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "data", + "summary": "{\n \"name\": \"data\",\n \"rows\": 106,\n \"fields\": [\n {\n \"column\": \"Class\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"-\",\n \"+\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 106,\n \"samples\": [\n \" 663\",\n \"RRNAB_P2\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Sequence\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 106,\n \"samples\": [\n \"\\t\\tgagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacg\",\n \"\\tgcaaaaataaatgcttgactctgtagcgggaaggcgtattatgcacaccccgcgccg\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1nnvISj56Pfl", + "outputId": "b15d7b8f-929d-44aa-882d-de53014016c7" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(106, 3)" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 178 + }, + "id": "DpVVtx1T6Pfn", + "outputId": "4f00a6be-63dd-40c6-de73-e75a11496c0a" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Class object\n", + "id object\n", + "Sequence object\n", + "dtype: object" + ], + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + "
0
Classobject
idobject
Sequenceobject

" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "data.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NbqALsWn6Pfo" + }, + "source": [ + "## Step 2: Preprocessing the Dataset\n", + "\n", + "The data is not in a usable form; as a result, we will need to process it before using it to train our algorithms." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 240 + }, + "id": "X9RgQSv86Pfp", + "outputId": "fa3ed198-56ce-4c91-ab1a-69edccc3f14c" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 +\n", + "1 +\n", + "2 +\n", + "3 +\n", + "4 +\n", + "Name: Class, dtype: object" + ], + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Class
0+
1+
2+
3+
4+

" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "# Build our dataset using custom pandas dataframe\n", + "clases = data.loc[:,'Class']\n", + "clases.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WIRgNNAN6Pfq", + "outputId": "596a7289-d5d8-419e-a0f3-aeb156a5cb20" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['\\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataatgcgcgggcttgtcgt',\n", + " '\\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaatctaacgcatcgccaa',\n", + " '\\t\\tgtactagagaactagtgcattagcttatttttttgttatcatgctaaccacccggcg',\n", + " '\\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaatactaacaaactc',\n", + " '\\ttcgataattaactattgacgaaaagctgaaaaccactagaatgcgcctccgtggtag',\n", + " '\\taggggcaaggaggatggaaagaggttgccgtataaagaaactagagtccgtttaggt',\n", + " '\\t\\tcagggggtggaggatttaagccatctcctgatgacgcatagtcagcccatcatgaat',\n", + " '\\t\\ttttctacaaaacacttgatactgtatgagcatacagtataattgcttcaacagaaca',\n", + " '\\t\\tcgacttaatatactgcgacaggacgtccgttctgtgtaaatcgcaatgaaatggttt',\n", + " '\\tttttaaatttcctcttgtcaggccggaataactccctataatgcgccaccactgaca',\n", + " '\\tgcaaaaataaatgcttgactctgtagcgggaaggcgtattatgcacaccccgcgccg',\n", + " '\\tcctgaaattcagggttgactctgaaagaggaaagcgtaatatacgccacctcgcgac',\n", + " '\\tgatcaaaaaaatacttgtgcaaaaaattgggatccctataatgcgcctccgttgaga',\n", + " '\\tctgcaatttttctattgcggcctgcggagaactccctataatgcgcctccatcgaca',\n", + " '\\ttttatatttttcgcttgtcaggccggaataactccctataatgcgccaccactgaca',\n", + " '\\taagcaaagaaatgcttgactctgtagcgggaaggcgtattatgcacaccgccgcgcc',\n", + " '\\tatgcatttttccgcttgtcttcctgagccgactccctataatgcgcctccatcgaca',\n", + " '\\t\\taaacaatttcagaatagacaaaaactctgagtgtaataatgtagcctcgtgtcttgc',\n", + " '\\t\\ttctcaacgtaacactttacagcggcgcgtcatttgatatgatgcgccccgcttcccg',\n", + " '\\t\\tgcaaataatcaatgtggacttttctgccgtgattatagacacttttgttacgcgttt',\n", + " 
'\\t\\tgacaccatcgaatggcgcaaaacctttcgcggtatggcatgatagcgcccggaagag',\n", + " '\\t\\taaaaacgtcatcgcttgcattagaaaggtttctggccgaccttataaccattaatta',\n", + " '\\t\\ttctgaaatgagctgttgacaattaatcatcgaactagttaactagtacgcaagttca',\n", + " '\\taccggaagaaaaccgtgacattttaacacgtttgttacaaggtaaaggcgacgccgc',\n", + " '\\t\\taaattaaaattttattgacttaggtcactaaatactttaaccaatataggcatagcg',\n", + " '\\t\\tttgtcataatcgacttgtaaaccaaattgaaaagatttaggtttacaagtctacacc',\n", + " '\\t\\tcatcctcgcaccagtcgacgacggtttacgctttacgtatagtggcgacaatttttt',\n", + " '\\ttccagtataatttgttggcataattaagtacgacgagtaaaattacatacctgcccg',\n", + " '\\tacagttatccactattcctgtggataaccatgtgtattagagttagaaaacacgagg',\n", + " '\\t\\ttgtgcagtttatggttccaaaatcgccttttgctgtatatactcacagcataactgt',\n", + " '\\tctgttgttcagtttttgagttgtgtataacccctcattctgatcccagcttatacgg',\n", + " '\\tattacaaaaagtgctttctgaactgaacaaaaaagagtaaagttagtcgcgtagggt',\n", + " '\\tatgcgcaacgcggggtgacaagggcgcgcaaaccctctatactgcgcgccgaagctg',\n", + " '\\t\\ttaaaaaactaacagttgtcagcctgtcccgcttataagatcatacgccgttatacgt',\n", + " '\\t\\tatgcaattttttagttgcatgaactcgcatgtctccatagaatgcgcgctacttgat',\n", + " '\\tccttgaaaaagaggttgacgctgcaaggctctatacgcataatgcgccccgcaacgc',\n", + " '\\t\\ttcgttgtatatttcttgacaccttttcggcatcgccctaaaattcggcgtcctcata',\n", + " '\\t\\tccgtttattttttctacccatatccttgaagcggtgttataatgccgcgccctcgat',\n", + " '\\t\\tttcgcatatttttcttgcaaagttgggttgagctggctagattagccagccaatctt',\n", + " '\\t\\ttgtaaactaatgcctttacgtgggcggtgattttgtctacaatcttacccccacgta',\n", + " '\\tgatcgcacgatctgtatacttatttgagtaaattaacccacgatcccagccattctt',\n", + " '\\t\\taacgcatacggtattttaccttcccagtcaagaaaacttatcttattcccacttttc',\n", + " '\\tttagcggatcctacctgacgctttttatcgcaactctctactgtttctccatacccg',\n", + " '\\t\\tgccttctccaaaacgtgttttttgttgttaattcggtgtagacttgtaaacctaaat',\n", + " '\\tcagaaacgttttattcgaacatcgatctcgtcttgtgttagaattctaacatacggt',\n", + " '\\tcactaatttattccatgtcacacttttcgcatctttgttatgctatggttatttcat',\n", + " '\\t\\tatataaaaaagttcttgctttctaacgtgaaagtggtttaggttaaaagacatcagt',\n", + " 
'\\t\\tcaaggtagaatgctttgccttgtcggcctgattaatggcacgatagtcgcatcggat',\n", + " '\\tggccaaaaaatatcttgtactatttacaaaacctatggtaactctttaggcattcct',\n", + " '\\ttaggcaccccaggctttacactttatgcttccggctcgtatgttgtgtggaattgtg',\n", + " '\\t\\tccatcaaaaaaatattctcaacataaaaaactttgtgtaatacttgtaacgctacat',\n", + " '\\t\\ttggggacgtcgttactgatccgcacgtttatgatatgctatcgtactctttagcgag',\n", + " '\\ttcagaaatattatggtgatgaactgtttttttatccagtataatttgttggcataat',\n", + " '\\t\\tatatgaacgttgagactgccgctgagttatcagctgtgaacgacattctggcgtcta',\n", + " '\\t\\tcgaacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctc',\n", + " '\\t\\tcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatatgcg',\n", + " '\\t\\tttgacctactacgccagcattttggcggtgtaagctaaccattccggttgactcaat',\n", + " '\\t\\tcgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatgcag',\n", + " '\\t\\tgccaatcaatcaagaacttgaagggtggtatcagccaacagcctgacatccttcgtt',\n", + " '\\t\\ttggatggacgttcaacattgaggaaggcataacgctactacctgatgtttactccaa',\n", + " '\\t\\tgaggtggctatgtgtatgaccgaacgagtcaatcagaccgctttgactctggtatta',\n", + " '\\t\\tcgtagcgcatcagtgctttcttactgtgagtacgcaccagcgccagaggacgacgac',\n", + " '\\t\\tcgaccgaagcgagcctcgtcctcaatggcctctaaacgggtcttgaggggttttttg',\n", + " '\\t\\tctacggtgggtacaatatgctggatggagatgcgttcacttctggtctactgactcg',\n", + " '\\t\\tatagtctcagagtcttgacctactacgccagcattttggcggtgtaagctaaccatt',\n", + " '\\t\\taactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagcagcg',\n", + " '\\t\\tttactgtgaacattattcgtctccgcgactacgatgagatgcctgagtgcttccgtt',\n", + " '\\t\\ttattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattga',\n", + " '\\t\\taacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctccg',\n", + " '\\t\\taagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatggcc',\n", + " '\\t\\tgaagaccacgcctcgccaccgagtagacccttagagagcatgtcagcctcgacaact',\n", + " '\\t\\tttagagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccct',\n", + " '\\t\\ttattcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcac',\n", + " 
'\\t\\ttgctgaaaggaggaactatatgcgctcatacgatatgaacgttgagactgccgctga',\n", + " '\\t\\tcatgaactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagc',\n", + " '\\t\\tttcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcacca',\n", + " '\\t\\tcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacgcgctt',\n", + " '\\t\\taggaggaactacgcaaggttggaacatcggagagatgccagccagcgcacctgcacg',\n", + " '\\t\\ttctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattgagga',\n", + " '\\t\\ttgaagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatgg',\n", + " '\\t\\tctatatgcgctcatacgatatgaacgttgagactgccgctgagttatcagctgtgaa',\n", + " '\\t\\tgcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccggt',\n", + " '\\t\\tatccctaatgtctacttccggtcaatccatctacgttaaccgaggtggctatgtgta',\n", + " '\\t\\ttggcgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatg',\n", + " '\\t\\ttctcgtggatggacgttcaacattgaggaaggcataacgctactacctgatgtttac',\n", + " '\\t\\ttattggcttgctcaagcatgaactcaaggctgatacggcgagacttgcgagccttgt',\n", + " '\\t\\ttagagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatg',\n", + " '\\t\\tcagcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccg',\n", + " '\\t\\tttacgttggcgaccgctaggactttcttgttgattttccatgcggtgttttgcgcaa',\n", + " '\\t\\tacgctaacgcagatgcagcgaacgctcggcgtattctcaacaagattaaccgacaga',\n", + " '\\t\\tggtgttttgcgcaatgttaatcgctttgtacacctcaggcatgtaaacgtcttcgta',\n", + " '\\t\\taaccattccggttgactcaatgagcatctcgatgcagcgtactcctacatgaataga',\n", + " '\\t\\tagacgtctctgcatggagtatgagatggactacggtgggtacaatatgctggatgga',\n", + " '\\t\\ttgttgattttccatgcggtgttttgcgcaatgttaatcgctttgtacacctcaggca',\n", + " '\\t\\ttgcacgggttgcgatagcctcagcgtattcaggtgcgagttcgatagtctcagagtc',\n", + " '\\t\\taggcatgtaaacgtcttcgtagcgcatcagtgctttcttactgtgagtacgcaccag',\n", + " '\\t\\tccgagtagacccttagagagcatgtcagcctcgacaacttgcataaatgctttcttg',\n", + " '\\t\\tcgctaggactttcttgttgattttccatgcggtgttttgcgcaatgttaatcgcttt',\n", + " '\\t\\ttatgaccgaacgagtcaatcagaccgctttgactctggtattactgtgaacattatt',\n", + " 
'\\t\\tagagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatga',\n", + " '\\t\\tgagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacg',\n", + " '\\t\\tcctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatat',\n", + " '\\t\\tgtattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattg',\n", + " '\\t\\tcgcgactacgatgagatgcctgagtgcttccgttactggattgtcaccaaggcttcc',\n", + " '\\t\\tctcgtcctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaac',\n", + " '\\t\\ttaacattaataaataaggaggctctaatggcactcattagccaatcaatcaagaact']" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ], + "source": [ + "# generate list of DNA sequence\n", + "sequence = list(data.loc[:, 'Sequence'])\n", + "sequence" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rhSkfaKO6Pfr", + "outputId": "a9afc6e9-690e-40c8-e217-f11764534d5e" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['t',\n", + " 'a',\n", + " 'c',\n", + " 't',\n", + " 'a',\n", + " 'g',\n", + " 'c',\n", + " 'a',\n", + " 'a',\n", + " 't',\n", + " 'a',\n", + " 'c',\n", + " 'g',\n", + " 'c',\n", + " 't',\n", + " 't',\n", + " 'g',\n", + " 'c',\n", + " 'g',\n", + " 't',\n", + " 't',\n", + " 'c',\n", + " 'g',\n", + " 'g',\n", + " 't',\n", + " 'g',\n", + " 'g',\n", + " 't',\n", + " 't',\n", + " 'a',\n", + " 'a',\n", + " 'g',\n", + " 't',\n", + " 'a',\n", + " 't',\n", + " 'g',\n", + " 't',\n", + " 'a',\n", + " 't',\n", + " 'a',\n", + " 'a',\n", + " 't',\n", + " 'g',\n", + " 'c',\n", + " 'g',\n", + " 'c',\n", + " 'g',\n", + " 'g',\n", + " 'g',\n", + " 'c',\n", + " 't',\n", + " 't',\n", + " 'g',\n", + " 't',\n", + " 'c',\n", + " 'g',\n", + " 't',\n", + " '+']" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ], + "source": [ + "#Remove tab from each sequence\n", + "dic = {}\n", + "for i, seq in enumerate(sequence):\n", + " nucleotides = list(seq)\n", + " nucleotides = [char for char in nucleotides if 
char != '\\t']\n", + " #append class assignment\n", + " nucleotides.append(clases[i])\n", + "\n", + " dic[i] = nucleotides\n", + "dic[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "id": "xtt11Jdi6Pfs", + "outputId": "5fb27c39-d2ff-4599-b407-e7b6ba45cbc2" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... 96 97 98 99 100 101 102 \\\n", + "0 t t g a t a c t c t ... c c t a g c g \n", + "1 a g t a c g a t g t ... c g a g a c t \n", + "2 c c a t g g g t a t ... g c t a g t a \n", + "3 t t c t a g g c c t ... a t g g a c t \n", + "4 a a t g t g g t t a ... g a a g g a t \n", + "\n", + " 103 104 105 \n", + "0 c c t \n", + "1 g t a \n", + "2 c c a \n", + "3 g g c \n", + "4 a t a \n", + "\n", + "[5 rows x 106 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...96979899100101102103104105
0ttgatactct...cctagcgcct
1agtacgatgt...cgagactgta
2ccatgggtat...gctagtacca
3ttctaggcct...atggactggc
4aatgtggtta...gaaggatata
\n", + "

5 rows × 106 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df" + } + }, + "metadata": {}, + "execution_count": 13 + } + ], + "source": [ + "# Convert Dict object into dataframe\n", + "df = pd.DataFrame(dic)\n", + "df.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "id": "Db8jOvEE6Pft", + "outputId": "050dc20a-6ba1-441e-a560-373cbe52b8c3" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 57\n", + "0 t a c t a g c a a t ... g c t t g t c g t +\n", + "1 t g c t a t c c t g ... c a t c g c c a a +\n", + "2 g t a c t a g a g a ... c a c c c g g c g +\n", + "3 a a t t g t g a t g ... a a c a a a c t c +\n", + "4 t c g a t a a t t a ... c c g t g g t a g +\n", + "\n", + "[5 rows x 58 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...48495051525354555657
0tactagcaat...gcttgtcgt+
1tgctatcctg...catcgccaa+
2gtactagaga...cacccggcg+
3aattgtgatg...aacaaactc+
4tcgataatta...ccgtggtag+
\n", + "

5 rows × 58 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df" + } + }, + "metadata": {}, + "execution_count": 14 + } + ], + "source": [ + "# transpose dataframe into correct format\n", + "df = df.transpose()\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VO6oTsFo6Pfu", + "outputId": "52eb6610-e876-46e4-8aab-5dd287344501" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "RangeIndex(start=0, stop=58, step=1)" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "PGRI9uyc6Pfv" + }, + "outputs": [], + "source": [ + "# Rename\n", + "df.rename(columns = {57:'Class'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QqQVl-wY6Pfw", + "outputId": "b5241da6-014b-4d2f-fee9-97908db79712" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index([ 0, 1, 2, 3, 4, 5, 6, 7,\n", + " 8, 9, 10, 11, 12, 13, 14, 15,\n", + " 16, 17, 18, 19, 20, 21, 22, 23,\n", + " 24, 25, 26, 27, 28, 29, 30, 31,\n", + " 32, 33, 34, 35, 36, 37, 38, 39,\n", + " 40, 41, 42, 43, 44, 45, 46, 47,\n", + " 48, 49, 50, 51, 52, 53, 54, 55,\n", + " 56, 'Class'],\n", + " dtype='object')" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "id": "hbHhNFpH6Pfx", + "outputId": "196821ab-7fbc-4089-c951-03106e15e353" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... 
48 49 50 51 52 53 54 55 56 Class\n", + "0 t a c t a g c a a t ... g c t t g t c g t +\n", + "1 t g c t a t c c t g ... c a t c g c c a a +\n", + "2 g t a c t a g a g a ... c a c c c g g c g +\n", + "3 a a t t g t g a t g ... a a c a a a c t c +\n", + "4 t c g a t a a t t a ... c c g t g g t a g +\n", + "\n", + "[5 rows x 58 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...484950515253545556Class
0tactagcaat...gcttgtcgt+
1tgctatcctg...catcgccaa+
2gtactagaga...cacccggcg+
3aattgtgatg...aacaaactc+
4tcgataatta...ccgtggtag+
\n", + "

5 rows × 58 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df" + } + }, + "metadata": {}, + "execution_count": 18 + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "id": "C-J6FSJI6Pfx", + "outputId": "af18f038-b045-4df1-c205-12f3d8b1318f" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 0_a 0_c 0_g 0_t 1_a 1_c 1_g 1_t 2_a 2_c ... \\\n", + "0 False False False True True False False False False True ... \n", + "1 False False False True False False True False False True ... \n", + "2 False False True False False False False True True False ... \n", + "3 True False False False True False False False False False ... \n", + "4 False False False True False True False False False False ... \n", + "\n", + " 55_a 55_c 55_g 55_t 56_a 56_c 56_g 56_t Class_+ Class_- \n", + "0 False False True False False False False True True False \n", + "1 True False False False True False False False True False \n", + "2 False True False False False False True False True False \n", + "3 False False False True False True False False True False \n", + "4 True False False False False False True False True False \n", + "\n", + "[5 rows x 230 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0_a0_c0_g0_t1_a1_c1_g1_t2_a2_c...55_a55_c55_g55_t56_a56_c56_g56_tClass_+Class_-
0FalseFalseFalseTrueTrueFalseFalseFalseFalseTrue...FalseFalseTrueFalseFalseFalseFalseTrueTrueFalse
1FalseFalseFalseTrueFalseFalseTrueFalseFalseTrue...TrueFalseFalseFalseTrueFalseFalseFalseTrueFalse
2FalseFalseTrueFalseFalseFalseFalseTrueTrueFalse...FalseTrueFalseFalseFalseFalseTrueFalseTrueFalse
3TrueFalseFalseFalseTrueFalseFalseFalseFalseFalse...FalseFalseFalseTrueFalseTrueFalseFalseTrueFalse
4FalseFalseFalseTrueFalseTrueFalseFalseFalseFalse...TrueFalseFalseFalseFalseFalseTrueFalseTrueFalse
\n", + "

5 rows × 230 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "numerical_df" + } + }, + "metadata": {}, + "execution_count": 19 + } + ], + "source": [ + "#Encoding\n", + "numerical_df = pd.get_dummies(df)\n", + "numerical_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "id": "NtOIsbv06Pfy", + "outputId": "3dae252b-f091-46b5-ec1c-1ab44f766c73" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 0_a 0_c 0_g 0_t 1_a 1_c 1_g 1_t 2_a 2_c ... \\\n", + "0 False False False True True False False False False True ... \n", + "1 False False False True False False True False False True ... \n", + "2 False False True False False False False True True False ... \n", + "3 True False False False True False False False False False ... \n", + "4 False False False True False True False False False False ... \n", + "\n", + " 54_t 55_a 55_c 55_g 55_t 56_a 56_c 56_g 56_t Class_+ \n", + "0 False False False True False False False False True True \n", + "1 False True False False False True False False False True \n", + "2 False False True False False False False True False True \n", + "3 False False False False True False True False False True \n", + "4 True True False False False False False True False True \n", + "\n", + "[5 rows x 229 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0_a0_c0_g0_t1_a1_c1_g1_t2_a2_c...54_t55_a55_c55_g55_t56_a56_c56_g56_tClass_+
0FalseFalseFalseTrueTrueFalseFalseFalseFalseTrue...FalseFalseFalseTrueFalseFalseFalseFalseTrueTrue
1FalseFalseFalseTrueFalseFalseTrueFalseFalseTrue...FalseTrueFalseFalseFalseTrueFalseFalseFalseTrue
2FalseFalseTrueFalseFalseFalseFalseTrueTrueFalse...FalseFalseTrueFalseFalseFalseFalseTrueFalseTrue
3TrueFalseFalseFalseTrueFalseFalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseTrueFalseFalseTrue
4FalseFalseFalseTrueFalseTrueFalseFalseFalseFalse...TrueTrueFalseFalseFalseFalseFalseTrueFalseTrue
\n", + "

5 rows × 229 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "numerical_df" + } + }, + "metadata": {}, + "execution_count": 20 + } + ], + "source": [ + "# get_dummies produced both 'Class_-' and 'Class_+'; one indicator is enough, so drop 'Class_-'.\n", + "# errors = 'ignore' keeps this cell re-runnable once the column is already gone.\n", + "numerical_df = numerical_df.drop('Class_-', axis = 1, errors = 'ignore')\n", + "numerical_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "id": "iGyUCF226Pfz" + }, + "outputs": [], + "source": [ + "# rename Class_+ to Class (rebinding instead of inplace keeps the cell idempotent)\n", + "numerical_df = numerical_df.rename(columns = {'Class_+':'Class'})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cj76tLTB6Pfz" + }, + "source": [ + "## Step 3: Training and Testing the Classification Algorithms\n", + "\n", + "Now that we have preprocessed the data, we can split it into training and testing datasets and start to deploy different classification algorithms. It's relatively easy to test multiple models; as a result, we will compare and contrast the performance of ten different algorithms."
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "id": "JSSgxqzG6Pf0" + }, + "outputs": [], + "source": [ + "#Importing different classifier from sklearn\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn import svm\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.gaussian_process.kernels import RBF\n", + "from sklearn.gaussian_process import GaussianProcessClassifier\n", + "from sklearn.neural_network import MLPClassifier\n", + "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report, accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "id": "F2yJxt9S6Pf0" + }, + "outputs": [], + "source": [ + "# Separate the one-hot encoded features (X) from the boolean 'Class' label (y)\n", + "X = numerical_df.drop(columns = ['Class']).values\n", + "y = numerical_df['Class'].values\n", + "\n", + "#define a seed for reproducibility\n", + "seed = 1\n", + "\n", + "# Hold out 25% of the samples as the test set\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = seed)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2KLQMlHW6Pf0", + "outputId": "417ba0b4-4494-4bfb-da56-fc099e1cfa84" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "K Nearest Neighbors: 0.8107142857142857 (0.099808490089158)\n", + "Gaussian Process: 0.8553571428571429 (0.1606051216556957)\n", + "Decision Tree: 0.6821428571428572 (0.1316593470797626)\n", + "Random Forest: 0.5714285714285714 (0.19232731454051802)\n", + "Neural Net: 0.9125 (0.09762812094883318)\n", + "AdaBoost: 0.8625 (0.14197270864500683)\n", + "Naive Bayes: 0.8375 (0.1125)\n", + "SVM Linear: 0.9125 (0.09762812094883318)\n", + "SVM RBF: 0.875
(0.11180339887498948)\n", + "SVM Sigmoid: 0.925 (0.1)\n" + ] + } + ], + "source": [ + "# Define scoring method (passed to cross_val_score below)\n", + "scoring = 'accuracy'\n", + "# Model building to train; `names` and `Classifiers` are index-aligned and reused by the evaluation cell\n", + "names = ['K Nearest Neighbors', 'Gaussian Process', 'Decision Tree', 'Random Forest', 'Neural Net', 'AdaBoost', 'Naive Bayes', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid']\n", + "Classifiers = [\n", + "    KNeighborsClassifier(n_neighbors=3),\n", + "    GaussianProcessClassifier(1.0*RBF(1.0)),\n", + "    DecisionTreeClassifier(max_depth=5),\n", + "    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),\n", + "    MLPClassifier(alpha=1),\n", + "    AdaBoostClassifier(),\n", + "    GaussianNB(),\n", + "    svm.SVC(kernel='linear'),\n", + "    svm.SVC(kernel='rbf'),\n", + "    svm.SVC(kernel='sigmoid')\n", + "]\n", + "# A list (not a one-shot zip iterator) so the pairs can be iterated more than once\n", + "models = list(zip(names, Classifiers))\n", + "\n", + "# import KFold\n", + "from sklearn.model_selection import KFold, cross_val_score\n", + "\n", + "# One shuffled 10-fold splitter for all models; `seed` comes from the train/test split cell.\n", + "# With an integer random_state every split() call yields the same folds, so each model is scored on identical folds.\n", + "kfold = KFold(n_splits=10, shuffle=True, random_state=seed)\n", + "\n", + "result = []\n", + "for name, model in models:\n", + "    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)\n", + "    result.append(cv_results)\n", + "    msg = \"{0}: {1} ({2})\".format(name, cv_results.mean(), cv_results.std())\n", + "    print(msg)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "misVAgvl6Pf1" + }, + "source": [ + "## Step 4 : Model Evaluation\n", + "\n", + "Now we will evaluate our classification algorithms on the test set using the accuracy score and a classification report."
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IwbJR7Nz6Pf2", + "outputId": "fe65867f-a939-4a32-d58d-dbfdd1437441" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "K Nearest Neighbors\n", + "0.7777777777777778\n", + " precision recall f1-score support\n", + "\n", + " False 1.00 0.65 0.79 17\n", + " True 0.62 1.00 0.77 10\n", + "\n", + " accuracy 0.78 27\n", + " macro avg 0.81 0.82 0.78 27\n", + "weighted avg 0.86 0.78 0.78 27\n", + "\n", + "Gaussian Process\n", + "0.8888888888888888\n", + " precision recall f1-score support\n", + "\n", + " False 1.00 0.82 0.90 17\n", + " True 0.77 1.00 0.87 10\n", + "\n", + " accuracy 0.89 27\n", + " macro avg 0.88 0.91 0.89 27\n", + "weighted avg 0.91 0.89 0.89 27\n", + "\n", + "Decision Tree\n", + "0.7407407407407407\n", + " precision recall f1-score support\n", + "\n", + " False 1.00 0.59 0.74 17\n", + " True 0.59 1.00 0.74 10\n", + "\n", + " accuracy 0.74 27\n", + " macro avg 0.79 0.79 0.74 27\n", + "weighted avg 0.85 0.74 0.74 27\n", + "\n", + "Random Forest\n", + "0.4444444444444444\n", + " precision recall f1-score support\n", + "\n", + " False 0.67 0.24 0.35 17\n", + " True 0.38 0.80 0.52 10\n", + "\n", + " accuracy 0.44 27\n", + " macro avg 0.52 0.52 0.43 27\n", + "weighted avg 0.56 0.44 0.41 27\n", + "\n", + "Neural Net\n", + "0.8888888888888888\n", + " precision recall f1-score support\n", + "\n", + " False 1.00 0.82 0.90 17\n", + " True 0.77 1.00 0.87 10\n", + "\n", + " accuracy 0.89 27\n", + " macro avg 0.88 0.91 0.89 27\n", + "weighted avg 0.91 0.89 0.89 27\n", + "\n", + "AdaBoost\n", + "0.8518518518518519\n", + " precision recall f1-score support\n", + "\n", + " False 1.00 0.76 0.87 17\n", + " True 0.71 1.00 0.83 10\n", + "\n", + " accuracy 0.85 27\n", + " macro avg 0.86 0.88 0.85 27\n", + "weighted avg 0.89 0.85 0.85 27\n", + "\n", + "Naive Bayes\n", + "0.9259259259259259\n", + " 
precision recall f1-score support\n", + "\n", + " False 1.00 0.88 0.94 17\n", + " True 0.83 1.00 0.91 10\n", + "\n", + " accuracy 0.93 27\n", + " macro avg 0.92 0.94 0.92 27\n", + "weighted avg 0.94 0.93 0.93 27\n", + "\n", + "SVM Linear\n", + "0.9629629629629629\n", + " precision recall f1-score support\n", + "\n", + " False 1.00 0.94 0.97 17\n", + " True 0.91 1.00 0.95 10\n", + "\n", + " accuracy 0.96 27\n", + " macro avg 0.95 0.97 0.96 27\n", + "weighted avg 0.97 0.96 0.96 27\n", + "\n", + "SVM RBF\n", + "0.9259259259259259\n", + " precision recall f1-score support\n", + "\n", + " False 1.00 0.88 0.94 17\n", + " True 0.83 1.00 0.91 10\n", + "\n", + " accuracy 0.93 27\n", + " macro avg 0.92 0.94 0.92 27\n", + "weighted avg 0.94 0.93 0.93 27\n", + "\n", + "SVM Sigmoid\n", + "0.9259259259259259\n", + " precision recall f1-score support\n", + "\n", + " False 1.00 0.88 0.94 17\n", + " True 0.83 1.00 0.91 10\n", + "\n", + " accuracy 0.93 27\n", + " macro avg 0.92 0.94 0.92 27\n", + "weighted avg 0.94 0.93 0.93 27\n", + "\n" + ] + } + ], + "source": [ + "# Fit every classifier on the training split, then report accuracy and the per-class report on the test split\n", + "models = zip(names, Classifiers)\n", + "for name, model in models:\n", + "    # fit() returns the fitted estimator, so prediction can be chained\n", + "    y_pred = model.fit(X_train, y_train).predict(X_test)\n", + "    print(name)\n", + "    print(accuracy_score(y_test, y_pred))\n", + "    print(classification_report(y_test, y_pred))\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lv6YkrVY6Pf2" + }, + "source": [ + "## Conclusion :" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "av-mepng6Pf3" + }, + "source": [ + "#### From the above report, the Support Vector Machine with a 'linear' kernel performed best, with an F1-score of 0.96 on the test data."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/DNA CLASSIFICATION/promoters.data b/DNA CLASSIFICATION/promoters.data new file mode 100644 index 00000000..ec211eac --- /dev/null +++ b/DNA CLASSIFICATION/promoters.data @@ -0,0 +1,106 @@ ++,S10, tactagcaatacgcttgcgttcggtggttaagtatgtataatgcgcgggcttgtcgt ++,AMPC, tgctatcctgacagttgtcacgctgattggtgtcgttacaatctaacgcatcgccaa ++,AROH, gtactagagaactagtgcattagcttatttttttgttatcatgctaaccacccggcg ++,DEOP2, aattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaatactaacaaactc ++,LEU1_TRNA, tcgataattaactattgacgaaaagctgaaaaccactagaatgcgcctccgtggtag ++,MALEFG, aggggcaaggaggatggaaagaggttgccgtataaagaaactagagtccgtttaggt ++,MALK, cagggggtggaggatttaagccatctcctgatgacgcatagtcagcccatcatgaat ++,RECA, tttctacaaaacacttgatactgtatgagcatacagtataattgcttcaacagaaca ++,RPOB, cgacttaatatactgcgacaggacgtccgttctgtgtaaatcgcaatgaaatggttt ++,RRNAB_P1, ttttaaatttcctcttgtcaggccggaataactccctataatgcgccaccactgaca ++,RRNAB_P2, gcaaaaataaatgcttgactctgtagcgggaaggcgtattatgcacaccccgcgccg ++,RRNDEX_P2, cctgaaattcagggttgactctgaaagaggaaagcgtaatatacgccacctcgcgac ++,RRND_P1, gatcaaaaaaatacttgtgcaaaaaattgggatccctataatgcgcctccgttgaga ++,RRNE_P1, ctgcaatttttctattgcggcctgcggagaactccctataatgcgcctccatcgaca ++,RRNG_P1, tttatatttttcgcttgtcaggccggaataactccctataatgcgccaccactgaca ++,RRNG_P2, aagcaaagaaatgcttgactctgtagcgggaaggcgtattatgcacaccgccgcgcc ++,RRNX_P1, atgcatttttccgcttgtcttcctgagccgactccctataatgcgcctccatcgaca ++,TNAA, aaacaatttcagaatagacaaaaactctgagtgtaataatgtagcctcgtgtcttgc ++,TYRT, 
tctcaacgtaacactttacagcggcgcgtcatttgatatgatgcgccccgcttcccg ++,ARAC, gcaaataatcaatgtggacttttctgccgtgattatagacacttttgttacgcgttt ++,LACI, gacaccatcgaatggcgcaaaacctttcgcggtatggcatgatagcgcccggaagag ++,MALT, aaaaacgtcatcgcttgcattagaaaggtttctggccgaccttataaccattaatta ++,TRP, tctgaaatgagctgttgacaattaatcatcgaactagttaactagtacgcaagttca ++,TRPP2, accggaagaaaaccgtgacattttaacacgtttgttacaaggtaaaggcgacgccgc ++,THR, aaattaaaattttattgacttaggtcactaaatactttaaccaatataggcatagcg ++,BIOB, ttgtcataatcgacttgtaaaccaaattgaaaagatttaggtttacaagtctacacc ++,FOL, catcctcgcaccagtcgacgacggtttacgctttacgtatagtggcgacaatttttt ++,UVRBP1, tccagtataatttgttggcataattaagtacgacgagtaaaattacatacctgcccg ++,UVRBP3, acagttatccactattcctgtggataaccatgtgtattagagttagaaaacacgagg ++,LEXA, tgtgcagtttatggttccaaaatcgccttttgctgtatatactcacagcataactgt ++,PORI-L, ctgttgttcagtttttgagttgtgtataacccctcattctgatcccagcttatacgg ++,SPOT42, attacaaaaagtgctttctgaactgaacaaaaaagagtaaagttagtcgcgtagggt ++,M1RNA, atgcgcaacgcggggtgacaagggcgcgcaaaccctctatactgcgcgccgaagctg ++,GLNS, taaaaaactaacagttgtcagcctgtcccgcttataagatcatacgccgttatacgt ++,TUFB, atgcaattttttagttgcatgaactcgcatgtctccatagaatgcgcgctacttgat ++,SUBB-E, ccttgaaaaagaggttgacgctgcaaggctctatacgcataatgcgccccgcaacgc ++,STR, tcgttgtatatttcttgacaccttttcggcatcgccctaaaattcggcgtcctcata ++,SPC, ccgtttattttttctacccatatccttgaagcggtgttataatgccgcgccctcgat ++,RPOA, ttcgcatatttttcttgcaaagttgggttgagctggctagattagccagccaatctt ++,RPLJ, tgtaaactaatgcctttacgtgggcggtgattttgtctacaatcttacccccacgta ++,PORI-R, gatcgcacgatctgtatacttatttgagtaaattaacccacgatcccagccattctt ++,ALAS, aacgcatacggtattttaccttcccagtcaagaaaacttatcttattcccacttttc ++,ARABAD, ttagcggatcctacctgacgctttttatcgcaactctctactgtttctccatacccg ++,BIOA, gccttctccaaaacgtgttttttgttgttaattcggtgtagacttgtaaacctaaat ++,DEOP1, cagaaacgttttattcgaacatcgatctcgtcttgtgttagaattctaacatacggt ++,GALP2, cactaatttattccatgtcacacttttcgcatctttgttatgctatggttatttcat ++,HIS, atataaaaaagttcttgctttctaacgtgaaagtggtttaggttaaaagacatcagt ++,HISJ, 
caaggtagaatgctttgccttgtcggcctgattaatggcacgatagtcgcatcggat ++,ILVGEDA, ggccaaaaaatatcttgtactatttacaaaacctatggtaactctttaggcattcct ++,LACP1, taggcaccccaggctttacactttatgcttccggctcgtatgttgtgtggaattgtg ++,LPP, ccatcaaaaaaatattctcaacataaaaaactttgtgtaatacttgtaacgctacat ++,TRPR, tggggacgtcgttactgatccgcacgtttatgatatgctatcgtactctttagcgag ++,UVRB_P2, tcagaaatattatggtgatgaactgtttttttatccagtataatttgttggcataat +-, 867, atatgaacgttgagactgccgctgagttatcagctgtgaacgacattctggcgtcta +-,1169, cgaacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctc +-, 802, caatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatatgcg +-, 521, ttgacctactacgccagcattttggcggtgtaagctaaccattccggttgactcaat +-, 918, cgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatgcag +-,1481, gccaatcaatcaagaacttgaagggtggtatcagccaacagcctgacatccttcgtt +-,1024, tggatggacgttcaacattgaggaaggcataacgctactacctgatgtttactccaa +-,1149, gaggtggctatgtgtatgaccgaacgagtcaatcagaccgctttgactctggtatta +-, 313, cgtagcgcatcagtgctttcttactgtgagtacgcaccagcgccagaggacgacgac +-, 780, cgaccgaagcgagcctcgtcctcaatggcctctaaacgggtcttgaggggttttttg +-,1384, ctacggtgggtacaatatgctggatggagatgcgttcacttctggtctactgactcg +-, 507, atagtctcagagtcttgacctactacgccagcattttggcggtgtaagctaaccatt +-, 39, aactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagcagcg +-,1203, ttactgtgaacattattcgtctccgcgactacgatgagatgcctgagtgcttccgtt +-, 988, tattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattga +-,1171, aacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctccg +-, 753, aagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatggcc +-, 630, gaagaccacgcctcgccaccgagtagacccttagagagcatgtcagcctcgacaact +-, 660, ttagagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccct +-,1216, tattcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcac +-, 835, tgctgaaaggaggaactatatgcgctcatacgatatgaacgttgagactgccgctga +-, 35, catgaactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagc +-,1218, ttcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcacca +-, 668, 
catgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacgcgctt +-, 413, aggaggaactacgcaaggttggaacatcggagagatgccagccagcgcacctgcacg +-, 991, tctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattgagga +-, 751, tgaagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatgg +-, 850, ctatatgcgctcatacgatatgaacgttgagactgccgctgagttatcagctgtgaa +-, 93, gcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccggt +-,1108, atccctaatgtctacttccggtcaatccatctacgttaaccgaggtggctatgtgta +-, 915, tggcgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatg +-,1019, tctcgtggatggacgttcaacattgaggaaggcataacgctactacctgatgtttac +-, 19, tattggcttgctcaagcatgaactcaaggctgatacggcgagacttgcgagccttgt +-,1320, tagagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatg +-, 91, cagcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccg +-, 217, ttacgttggcgaccgctaggactttcttgttgattttccatgcggtgttttgcgcaa +-, 957, acgctaacgcagatgcagcgaacgctcggcgtattctcaacaagattaaccgacaga +-, 260, ggtgttttgcgcaatgttaatcgctttgtacacctcaggcatgtaaacgtcttcgta +-, 557, aaccattccggttgactcaatgagcatctcgatgcagcgtactcctacatgaataga +-,1355, agacgtctctgcatggagtatgagatggactacggtgggtacaatatgctggatgga +-, 244, tgttgattttccatgcggtgttttgcgcaatgttaatcgctttgtacacctcaggca +-, 464, tgcacgggttgcgatagcctcagcgtattcaggtgcgagttcgatagtctcagagtc +-, 296, aggcatgtaaacgtcttcgtagcgcatcagtgctttcttactgtgagtacgcaccag +-, 648, ccgagtagacccttagagagcatgtcagcctcgacaacttgcataaatgctttcttg +-, 230, cgctaggactttcttgttgattttccatgcggtgttttgcgcaatgttaatcgcttt +-,1163, tatgaccgaacgagtcaatcagaccgctttgactctggtattactgtgaacattatt +-,1321, agagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatga +-, 663, gagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacg +-, 799, cctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatat +-, 987, gtattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattg +-,1226, cgcgactacgatgagatgcctgagtgcttccgttactggattgtcaccaaggcttcc +-, 794, ctcgtcctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaac +-,1442, taacattaataaataaggaggctctaatggcactcattagccaatcaatcaagaact