From c3cfe9c8f7a2f784bebb2080dd010196430e307e Mon Sep 17 00:00:00 2001
From: Pavitraa G <100479594+pavitraag@users.noreply.github.com>
Date: Wed, 31 Jul 2024 14:08:40 +0530
Subject: [PATCH] Add files via upload
---
.../DNA_Classification_Code.ipynb | 3293 +++++++++++++++++
DNA CLASSIFICATION/promoters.data | 106 +
2 files changed, 3399 insertions(+)
create mode 100644 DNA CLASSIFICATION/DNA_Classification_Code.ipynb
create mode 100644 DNA CLASSIFICATION/promoters.data
diff --git a/DNA CLASSIFICATION/DNA_Classification_Code.ipynb b/DNA CLASSIFICATION/DNA_Classification_Code.ipynb
new file mode 100644
index 00000000..0d49afca
--- /dev/null
+++ b/DNA CLASSIFICATION/DNA_Classification_Code.ipynb
@@ -0,0 +1,3293 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QkgQB_-a6PfP"
+ },
+ "source": [
+ "# DNA Classification Using Machine Learning"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "5cJqFevm6PfW"
+ },
+ "source": [
+ "## About :\n",
+ "In this project, we explore bioinformatics by using Markov models, K-nearest neighbor (KNN) algorithms, support vector machines, and other common classifiers to classify short E. coli DNA sequences. The project uses a dataset from the UCI Machine Learning Repository that contains 106 DNA sequences, each made up of 57 sequential nucleotides (“base-pairs”).\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "It includes:\n",
+ "\n",
+ "- Importing data from the UCI repository\n",
+ "- Converting text inputs to numerical data\n",
+ "- Building and training classification algorithms\n",
+ "- Comparing and contrasting classification algorithms"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "N4d5V74G6Pfa"
+ },
+ "outputs": [],
+ "source": [
+ "# Hide warnings\n",
+ "import warnings\n",
+ "warnings.simplefilter('ignore')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UoHvr8t66Pfe"
+ },
+ "source": [
+ "## Step 1: Importing the Dataset\n",
+ "\n",
+ "The following code cells import the necessary libraries and load the dataset from the UCI repository as a Pandas DataFrame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "id": "njKpY80E6Pfg"
+ },
+ "outputs": [],
+ "source": [
+ "# Import libraries under their conventional aliases, then load the dataset\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import pandas as pd\n",
+ "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'\n",
+ "names = ['Class', 'id', 'Sequence']\n",
+ "data = pd.read_csv(url, names = names)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "SZlIvNHq6Pfh",
+ "outputId": "c3ad494b-07cb-4d4b-b7cb-061d331b6109"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Index(['Class', 'id', 'Sequence'], dtype='object')"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ],
+ "source": [
+ "data.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "v3Ffs5eT6Pfk",
+ "outputId": "6a6e13da-3e36-4876-e7fb-d796c925f1ac"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Class id Sequence\n",
+ "0 + S10 \\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...\n",
+ "1 + AMPC \\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...\n",
+ "2 + AROH \\t\\tgtactagagaactagtgcattagcttatttttttgttatcat...\n",
+ "3 + DEOP2 \\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...\n",
+ "4 + LEU1_TRNA \\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc..."
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Class \n",
+ " id \n",
+ " Sequence \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " + \n",
+ " S10 \n",
+ " \\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataat... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " + \n",
+ " AMPC \n",
+ " \\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat... \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " + \n",
+ " AROH \n",
+ " \\t\\tgtactagagaactagtgcattagcttatttttttgttatcat... \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " + \n",
+ " DEOP2 \n",
+ " \\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa... \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " + \n",
+ " LEU1_TRNA \n",
+ " \\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc... \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "data",
+ "summary": "{\n \"name\": \"data\",\n \"rows\": 106,\n \"fields\": [\n {\n \"column\": \"Class\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"-\",\n \"+\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 106,\n \"samples\": [\n \" 663\",\n \"RRNAB_P2\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Sequence\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 106,\n \"samples\": [\n \"\\t\\tgagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacg\",\n \"\\tgcaaaaataaatgcttgactctgtagcgggaaggcgtattatgcacaccccgcgccg\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ],
+ "source": [
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "1nnvISj56Pfl",
+ "outputId": "b15d7b8f-929d-44aa-882d-de53014016c7"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(106, 3)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ],
+ "source": [
+ "data.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 178
+ },
+ "id": "DpVVtx1T6Pfn",
+ "outputId": "4f00a6be-63dd-40c6-de73-e75a11496c0a"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Class object\n",
+ "id object\n",
+ "Sequence object\n",
+ "dtype: object"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Class \n",
+ " object \n",
+ " \n",
+ " \n",
+ " id \n",
+ " object \n",
+ " \n",
+ " \n",
+ " Sequence \n",
+ " object \n",
+ " \n",
+ " \n",
+ "
dtype: object "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ],
+ "source": [
+ "data.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "NbqALsWn6Pfo"
+ },
+ "source": [
+ "## Step 2: Preprocessing the Dataset\n",
+ "\n",
+ "The data is not in a usable form; as a result, we will need to process it before using it to train our algorithms."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 240
+ },
+ "id": "X9RgQSv86Pfp",
+ "outputId": "fa3ed198-56ce-4c91-ab1a-69edccc3f14c"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0 +\n",
+ "1 +\n",
+ "2 +\n",
+ "3 +\n",
+ "4 +\n",
+ "Name: Class, dtype: object"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " Class \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " + \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " + \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " + \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " + \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " + \n",
+ " \n",
+ " \n",
+ "
dtype: object "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 10
+ }
+ ],
+ "source": [
+ "# Extract the class label column ('+' or '-') for each sequence\n",
+ "clases = data.loc[:,'Class']\n",
+ "clases.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "WIRgNNAN6Pfq",
+ "outputId": "596a7289-d5d8-419e-a0f3-aeb156a5cb20"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['\\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataatgcgcgggcttgtcgt',\n",
+ " '\\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaatctaacgcatcgccaa',\n",
+ " '\\t\\tgtactagagaactagtgcattagcttatttttttgttatcatgctaaccacccggcg',\n",
+ " '\\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaatactaacaaactc',\n",
+ " '\\ttcgataattaactattgacgaaaagctgaaaaccactagaatgcgcctccgtggtag',\n",
+ " '\\taggggcaaggaggatggaaagaggttgccgtataaagaaactagagtccgtttaggt',\n",
+ " '\\t\\tcagggggtggaggatttaagccatctcctgatgacgcatagtcagcccatcatgaat',\n",
+ " '\\t\\ttttctacaaaacacttgatactgtatgagcatacagtataattgcttcaacagaaca',\n",
+ " '\\t\\tcgacttaatatactgcgacaggacgtccgttctgtgtaaatcgcaatgaaatggttt',\n",
+ " '\\tttttaaatttcctcttgtcaggccggaataactccctataatgcgccaccactgaca',\n",
+ " '\\tgcaaaaataaatgcttgactctgtagcgggaaggcgtattatgcacaccccgcgccg',\n",
+ " '\\tcctgaaattcagggttgactctgaaagaggaaagcgtaatatacgccacctcgcgac',\n",
+ " '\\tgatcaaaaaaatacttgtgcaaaaaattgggatccctataatgcgcctccgttgaga',\n",
+ " '\\tctgcaatttttctattgcggcctgcggagaactccctataatgcgcctccatcgaca',\n",
+ " '\\ttttatatttttcgcttgtcaggccggaataactccctataatgcgccaccactgaca',\n",
+ " '\\taagcaaagaaatgcttgactctgtagcgggaaggcgtattatgcacaccgccgcgcc',\n",
+ " '\\tatgcatttttccgcttgtcttcctgagccgactccctataatgcgcctccatcgaca',\n",
+ " '\\t\\taaacaatttcagaatagacaaaaactctgagtgtaataatgtagcctcgtgtcttgc',\n",
+ " '\\t\\ttctcaacgtaacactttacagcggcgcgtcatttgatatgatgcgccccgcttcccg',\n",
+ " '\\t\\tgcaaataatcaatgtggacttttctgccgtgattatagacacttttgttacgcgttt',\n",
+ " '\\t\\tgacaccatcgaatggcgcaaaacctttcgcggtatggcatgatagcgcccggaagag',\n",
+ " '\\t\\taaaaacgtcatcgcttgcattagaaaggtttctggccgaccttataaccattaatta',\n",
+ " '\\t\\ttctgaaatgagctgttgacaattaatcatcgaactagttaactagtacgcaagttca',\n",
+ " '\\taccggaagaaaaccgtgacattttaacacgtttgttacaaggtaaaggcgacgccgc',\n",
+ " '\\t\\taaattaaaattttattgacttaggtcactaaatactttaaccaatataggcatagcg',\n",
+ " '\\t\\tttgtcataatcgacttgtaaaccaaattgaaaagatttaggtttacaagtctacacc',\n",
+ " '\\t\\tcatcctcgcaccagtcgacgacggtttacgctttacgtatagtggcgacaatttttt',\n",
+ " '\\ttccagtataatttgttggcataattaagtacgacgagtaaaattacatacctgcccg',\n",
+ " '\\tacagttatccactattcctgtggataaccatgtgtattagagttagaaaacacgagg',\n",
+ " '\\t\\ttgtgcagtttatggttccaaaatcgccttttgctgtatatactcacagcataactgt',\n",
+ " '\\tctgttgttcagtttttgagttgtgtataacccctcattctgatcccagcttatacgg',\n",
+ " '\\tattacaaaaagtgctttctgaactgaacaaaaaagagtaaagttagtcgcgtagggt',\n",
+ " '\\tatgcgcaacgcggggtgacaagggcgcgcaaaccctctatactgcgcgccgaagctg',\n",
+ " '\\t\\ttaaaaaactaacagttgtcagcctgtcccgcttataagatcatacgccgttatacgt',\n",
+ " '\\t\\tatgcaattttttagttgcatgaactcgcatgtctccatagaatgcgcgctacttgat',\n",
+ " '\\tccttgaaaaagaggttgacgctgcaaggctctatacgcataatgcgccccgcaacgc',\n",
+ " '\\t\\ttcgttgtatatttcttgacaccttttcggcatcgccctaaaattcggcgtcctcata',\n",
+ " '\\t\\tccgtttattttttctacccatatccttgaagcggtgttataatgccgcgccctcgat',\n",
+ " '\\t\\tttcgcatatttttcttgcaaagttgggttgagctggctagattagccagccaatctt',\n",
+ " '\\t\\ttgtaaactaatgcctttacgtgggcggtgattttgtctacaatcttacccccacgta',\n",
+ " '\\tgatcgcacgatctgtatacttatttgagtaaattaacccacgatcccagccattctt',\n",
+ " '\\t\\taacgcatacggtattttaccttcccagtcaagaaaacttatcttattcccacttttc',\n",
+ " '\\tttagcggatcctacctgacgctttttatcgcaactctctactgtttctccatacccg',\n",
+ " '\\t\\tgccttctccaaaacgtgttttttgttgttaattcggtgtagacttgtaaacctaaat',\n",
+ " '\\tcagaaacgttttattcgaacatcgatctcgtcttgtgttagaattctaacatacggt',\n",
+ " '\\tcactaatttattccatgtcacacttttcgcatctttgttatgctatggttatttcat',\n",
+ " '\\t\\tatataaaaaagttcttgctttctaacgtgaaagtggtttaggttaaaagacatcagt',\n",
+ " '\\t\\tcaaggtagaatgctttgccttgtcggcctgattaatggcacgatagtcgcatcggat',\n",
+ " '\\tggccaaaaaatatcttgtactatttacaaaacctatggtaactctttaggcattcct',\n",
+ " '\\ttaggcaccccaggctttacactttatgcttccggctcgtatgttgtgtggaattgtg',\n",
+ " '\\t\\tccatcaaaaaaatattctcaacataaaaaactttgtgtaatacttgtaacgctacat',\n",
+ " '\\t\\ttggggacgtcgttactgatccgcacgtttatgatatgctatcgtactctttagcgag',\n",
+ " '\\ttcagaaatattatggtgatgaactgtttttttatccagtataatttgttggcataat',\n",
+ " '\\t\\tatatgaacgttgagactgccgctgagttatcagctgtgaacgacattctggcgtcta',\n",
+ " '\\t\\tcgaacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctc',\n",
+ " '\\t\\tcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatatgcg',\n",
+ " '\\t\\tttgacctactacgccagcattttggcggtgtaagctaaccattccggttgactcaat',\n",
+ " '\\t\\tcgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatgcag',\n",
+ " '\\t\\tgccaatcaatcaagaacttgaagggtggtatcagccaacagcctgacatccttcgtt',\n",
+ " '\\t\\ttggatggacgttcaacattgaggaaggcataacgctactacctgatgtttactccaa',\n",
+ " '\\t\\tgaggtggctatgtgtatgaccgaacgagtcaatcagaccgctttgactctggtatta',\n",
+ " '\\t\\tcgtagcgcatcagtgctttcttactgtgagtacgcaccagcgccagaggacgacgac',\n",
+ " '\\t\\tcgaccgaagcgagcctcgtcctcaatggcctctaaacgggtcttgaggggttttttg',\n",
+ " '\\t\\tctacggtgggtacaatatgctggatggagatgcgttcacttctggtctactgactcg',\n",
+ " '\\t\\tatagtctcagagtcttgacctactacgccagcattttggcggtgtaagctaaccatt',\n",
+ " '\\t\\taactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagcagcg',\n",
+ " '\\t\\tttactgtgaacattattcgtctccgcgactacgatgagatgcctgagtgcttccgtt',\n",
+ " '\\t\\ttattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattga',\n",
+ " '\\t\\taacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctccg',\n",
+ " '\\t\\taagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatggcc',\n",
+ " '\\t\\tgaagaccacgcctcgccaccgagtagacccttagagagcatgtcagcctcgacaact',\n",
+ " '\\t\\tttagagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccct',\n",
+ " '\\t\\ttattcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcac',\n",
+ " '\\t\\ttgctgaaaggaggaactatatgcgctcatacgatatgaacgttgagactgccgctga',\n",
+ " '\\t\\tcatgaactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagc',\n",
+ " '\\t\\tttcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcacca',\n",
+ " '\\t\\tcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacgcgctt',\n",
+ " '\\t\\taggaggaactacgcaaggttggaacatcggagagatgccagccagcgcacctgcacg',\n",
+ " '\\t\\ttctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattgagga',\n",
+ " '\\t\\ttgaagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatgg',\n",
+ " '\\t\\tctatatgcgctcatacgatatgaacgttgagactgccgctgagttatcagctgtgaa',\n",
+ " '\\t\\tgcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccggt',\n",
+ " '\\t\\tatccctaatgtctacttccggtcaatccatctacgttaaccgaggtggctatgtgta',\n",
+ " '\\t\\ttggcgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatg',\n",
+ " '\\t\\ttctcgtggatggacgttcaacattgaggaaggcataacgctactacctgatgtttac',\n",
+ " '\\t\\ttattggcttgctcaagcatgaactcaaggctgatacggcgagacttgcgagccttgt',\n",
+ " '\\t\\ttagagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatg',\n",
+ " '\\t\\tcagcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccg',\n",
+ " '\\t\\tttacgttggcgaccgctaggactttcttgttgattttccatgcggtgttttgcgcaa',\n",
+ " '\\t\\tacgctaacgcagatgcagcgaacgctcggcgtattctcaacaagattaaccgacaga',\n",
+ " '\\t\\tggtgttttgcgcaatgttaatcgctttgtacacctcaggcatgtaaacgtcttcgta',\n",
+ " '\\t\\taaccattccggttgactcaatgagcatctcgatgcagcgtactcctacatgaataga',\n",
+ " '\\t\\tagacgtctctgcatggagtatgagatggactacggtgggtacaatatgctggatgga',\n",
+ " '\\t\\ttgttgattttccatgcggtgttttgcgcaatgttaatcgctttgtacacctcaggca',\n",
+ " '\\t\\ttgcacgggttgcgatagcctcagcgtattcaggtgcgagttcgatagtctcagagtc',\n",
+ " '\\t\\taggcatgtaaacgtcttcgtagcgcatcagtgctttcttactgtgagtacgcaccag',\n",
+ " '\\t\\tccgagtagacccttagagagcatgtcagcctcgacaacttgcataaatgctttcttg',\n",
+ " '\\t\\tcgctaggactttcttgttgattttccatgcggtgttttgcgcaatgttaatcgcttt',\n",
+ " '\\t\\ttatgaccgaacgagtcaatcagaccgctttgactctggtattactgtgaacattatt',\n",
+ " '\\t\\tagagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatga',\n",
+ " '\\t\\tgagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacg',\n",
+ " '\\t\\tcctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatat',\n",
+ " '\\t\\tgtattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattg',\n",
+ " '\\t\\tcgcgactacgatgagatgcctgagtgcttccgttactggattgtcaccaaggcttcc',\n",
+ " '\\t\\tctcgtcctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaac',\n",
+ " '\\t\\ttaacattaataaataaggaggctctaatggcactcattagccaatcaatcaagaact']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 11
+ }
+ ],
+ "source": [
+ "# Generate a list of the raw DNA sequence strings\n",
+ "sequence = list(data.loc[:, 'Sequence'])\n",
+ "sequence"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "rhSkfaKO6Pfr",
+ "outputId": "a9afc6e9-690e-40c8-e217-f11764534d5e"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['t',\n",
+ " 'a',\n",
+ " 'c',\n",
+ " 't',\n",
+ " 'a',\n",
+ " 'g',\n",
+ " 'c',\n",
+ " 'a',\n",
+ " 'a',\n",
+ " 't',\n",
+ " 'a',\n",
+ " 'c',\n",
+ " 'g',\n",
+ " 'c',\n",
+ " 't',\n",
+ " 't',\n",
+ " 'g',\n",
+ " 'c',\n",
+ " 'g',\n",
+ " 't',\n",
+ " 't',\n",
+ " 'c',\n",
+ " 'g',\n",
+ " 'g',\n",
+ " 't',\n",
+ " 'g',\n",
+ " 'g',\n",
+ " 't',\n",
+ " 't',\n",
+ " 'a',\n",
+ " 'a',\n",
+ " 'g',\n",
+ " 't',\n",
+ " 'a',\n",
+ " 't',\n",
+ " 'g',\n",
+ " 't',\n",
+ " 'a',\n",
+ " 't',\n",
+ " 'a',\n",
+ " 'a',\n",
+ " 't',\n",
+ " 'g',\n",
+ " 'c',\n",
+ " 'g',\n",
+ " 'c',\n",
+ " 'g',\n",
+ " 'g',\n",
+ " 'g',\n",
+ " 'c',\n",
+ " 't',\n",
+ " 't',\n",
+ " 'g',\n",
+ " 't',\n",
+ " 'c',\n",
+ " 'g',\n",
+ " 't',\n",
+ " '+']"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 12
+ }
+ ],
+ "source": [
+ "# Split each sequence into single nucleotides, dropping the tab characters\n",
+ "dic = {}\n",
+ "for i, seq in enumerate(sequence):\n",
+ " nucleotides = list(seq)\n",
+ " nucleotides = [char for char in nucleotides if char != '\\t']\n",
+ " #append class assignment\n",
+ " nucleotides.append(clases[i])\n",
+ "\n",
+ " dic[i] = nucleotides\n",
+ "dic[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 235
+ },
+ "id": "xtt11Jdi6Pfs",
+ "outputId": "5fb27c39-d2ff-4599-b407-e7b6ba45cbc2"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " 0 1 2 3 4 5 6 7 8 9 ... 96 97 98 99 100 101 102 \\\n",
+ "0 t t g a t a c t c t ... c c t a g c g \n",
+ "1 a g t a c g a t g t ... c g a g a c t \n",
+ "2 c c a t g g g t a t ... g c t a g t a \n",
+ "3 t t c t a g g c c t ... a t g g a c t \n",
+ "4 a a t g t g g t t a ... g a a g g a t \n",
+ "\n",
+ " 103 104 105 \n",
+ "0 c c t \n",
+ "1 g t a \n",
+ "2 c c a \n",
+ "3 g g c \n",
+ "4 a t a \n",
+ "\n",
+ "[5 rows x 106 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " 4 \n",
+ " 5 \n",
+ " 6 \n",
+ " 7 \n",
+ " 8 \n",
+ " 9 \n",
+ " ... \n",
+ " 96 \n",
+ " 97 \n",
+ " 98 \n",
+ " 99 \n",
+ " 100 \n",
+ " 101 \n",
+ " 102 \n",
+ " 103 \n",
+ " 104 \n",
+ " 105 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " t \n",
+ " t \n",
+ " g \n",
+ " a \n",
+ " t \n",
+ " a \n",
+ " c \n",
+ " t \n",
+ " c \n",
+ " t \n",
+ " ... \n",
+ " c \n",
+ " c \n",
+ " t \n",
+ " a \n",
+ " g \n",
+ " c \n",
+ " g \n",
+ " c \n",
+ " c \n",
+ " t \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " a \n",
+ " g \n",
+ " t \n",
+ " a \n",
+ " c \n",
+ " g \n",
+ " a \n",
+ " t \n",
+ " g \n",
+ " t \n",
+ " ... \n",
+ " c \n",
+ " g \n",
+ " a \n",
+ " g \n",
+ " a \n",
+ " c \n",
+ " t \n",
+ " g \n",
+ " t \n",
+ " a \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " c \n",
+ " c \n",
+ " a \n",
+ " t \n",
+ " g \n",
+ " g \n",
+ " g \n",
+ " t \n",
+ " a \n",
+ " t \n",
+ " ... \n",
+ " g \n",
+ " c \n",
+ " t \n",
+ " a \n",
+ " g \n",
+ " t \n",
+ " a \n",
+ " c \n",
+ " c \n",
+ " a \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " t \n",
+ " t \n",
+ " c \n",
+ " t \n",
+ " a \n",
+ " g \n",
+ " g \n",
+ " c \n",
+ " c \n",
+ " t \n",
+ " ... \n",
+ " a \n",
+ " t \n",
+ " g \n",
+ " g \n",
+ " a \n",
+ " c \n",
+ " t \n",
+ " g \n",
+ " g \n",
+ " c \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " a \n",
+ " a \n",
+ " t \n",
+ " g \n",
+ " t \n",
+ " g \n",
+ " g \n",
+ " t \n",
+ " t \n",
+ " a \n",
+ " ... \n",
+ " g \n",
+ " a \n",
+ " a \n",
+ " g \n",
+ " g \n",
+ " a \n",
+ " t \n",
+ " a \n",
+ " t \n",
+ " a \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
5 rows × 106 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 13
+ }
+ ],
+ "source": [
+ "# Convert Dict object into dataframe\n",
+ "df = pd.DataFrame(dic)\n",
+ "df.head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 235
+ },
+ "id": "Db8jOvEE6Pft",
+ "outputId": "050dc20a-6ba1-441e-a560-373cbe52b8c3"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " 0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 57\n",
+ "0 t a c t a g c a a t ... g c t t g t c g t +\n",
+ "1 t g c t a t c c t g ... c a t c g c c a a +\n",
+ "2 g t a c t a g a g a ... c a c c c g g c g +\n",
+ "3 a a t t g t g a t g ... a a c a a a c t c +\n",
+ "4 t c g a t a a t t a ... c c g t g g t a g +\n",
+ "\n",
+ "[5 rows x 58 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " 4 \n",
+ " 5 \n",
+ " 6 \n",
+ " 7 \n",
+ " 8 \n",
+ " 9 \n",
+ " ... \n",
+ " 48 \n",
+ " 49 \n",
+ " 50 \n",
+ " 51 \n",
+ " 52 \n",
+ " 53 \n",
+ " 54 \n",
+ " 55 \n",
+ " 56 \n",
+ " 57 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " t \n",
+ " a \n",
+ " c \n",
+ " t \n",
+ " a \n",
+ " g \n",
+ " c \n",
+ " a \n",
+ " a \n",
+ " t \n",
+ " ... \n",
+ " g \n",
+ " c \n",
+ " t \n",
+ " t \n",
+ " g \n",
+ " t \n",
+ " c \n",
+ " g \n",
+ " t \n",
+ " + \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " t \n",
+ " g \n",
+ " c \n",
+ " t \n",
+ " a \n",
+ " t \n",
+ " c \n",
+ " c \n",
+ " t \n",
+ " g \n",
+ " ... \n",
+ " c \n",
+ " a \n",
+ " t \n",
+ " c \n",
+ " g \n",
+ " c \n",
+ " c \n",
+ " a \n",
+ " a \n",
+ " + \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " g \n",
+ " t \n",
+ " a \n",
+ " c \n",
+ " t \n",
+ " a \n",
+ " g \n",
+ " a \n",
+ " g \n",
+ " a \n",
+ " ... \n",
+ " c \n",
+ " a \n",
+ " c \n",
+ " c \n",
+ " c \n",
+ " g \n",
+ " g \n",
+ " c \n",
+ " g \n",
+ " + \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " a \n",
+ " a \n",
+ " t \n",
+ " t \n",
+ " g \n",
+ " t \n",
+ " g \n",
+ " a \n",
+ " t \n",
+ " g \n",
+ " ... \n",
+ " a \n",
+ " a \n",
+ " c \n",
+ " a \n",
+ " a \n",
+ " a \n",
+ " c \n",
+ " t \n",
+ " c \n",
+ " + \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " t \n",
+ " c \n",
+ " g \n",
+ " a \n",
+ " t \n",
+ " a \n",
+ " a \n",
+ " t \n",
+ " t \n",
+ " a \n",
+ " ... \n",
+ " c \n",
+ " c \n",
+ " g \n",
+ " t \n",
+ " g \n",
+ " g \n",
+ " t \n",
+ " a \n",
+ " g \n",
+ " + \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
5 rows × 58 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 14
+ }
+ ],
+ "source": [
+ "# transpose dataframe into correct format\n",
+ "df = df.transpose()\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "VO6oTsFo6Pfu",
+ "outputId": "52eb6610-e876-46e4-8aab-5dd287344501"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "RangeIndex(start=0, stop=58, step=1)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 15
+ }
+ ],
+ "source": [
+ "df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "id": "PGRI9uyc6Pfv"
+ },
+ "outputs": [],
+ "source": [
+ "# Rename the last column (index 57), which holds the label, to 'Class'\n",
+ "df.rename(columns = {57:'Class'}, inplace = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "QqQVl-wY6Pfw",
+ "outputId": "b5241da6-014b-4d2f-fee9-97908db79712"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Index([ 0, 1, 2, 3, 4, 5, 6, 7,\n",
+ " 8, 9, 10, 11, 12, 13, 14, 15,\n",
+ " 16, 17, 18, 19, 20, 21, 22, 23,\n",
+ " 24, 25, 26, 27, 28, 29, 30, 31,\n",
+ " 32, 33, 34, 35, 36, 37, 38, 39,\n",
+ " 40, 41, 42, 43, 44, 45, 46, 47,\n",
+ " 48, 49, 50, 51, 52, 53, 54, 55,\n",
+ " 56, 'Class'],\n",
+ " dtype='object')"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 17
+ }
+ ],
+ "source": [
+ "df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 235
+ },
+ "id": "hbHhNFpH6Pfx",
+ "outputId": "196821ab-7fbc-4089-c951-03106e15e353"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " 0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 Class\n",
+ "0 t a c t a g c a a t ... g c t t g t c g t +\n",
+ "1 t g c t a t c c t g ... c a t c g c c a a +\n",
+ "2 g t a c t a g a g a ... c a c c c g g c g +\n",
+ "3 a a t t g t g a t g ... a a c a a a c t c +\n",
+ "4 t c g a t a a t t a ... c c g t g g t a g +\n",
+ "\n",
+ "[5 rows x 58 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " 4 \n",
+ " 5 \n",
+ " 6 \n",
+ " 7 \n",
+ " 8 \n",
+ " 9 \n",
+ " ... \n",
+ " 48 \n",
+ " 49 \n",
+ " 50 \n",
+ " 51 \n",
+ " 52 \n",
+ " 53 \n",
+ " 54 \n",
+ " 55 \n",
+ " 56 \n",
+ " Class \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " t \n",
+ " a \n",
+ " c \n",
+ " t \n",
+ " a \n",
+ " g \n",
+ " c \n",
+ " a \n",
+ " a \n",
+ " t \n",
+ " ... \n",
+ " g \n",
+ " c \n",
+ " t \n",
+ " t \n",
+ " g \n",
+ " t \n",
+ " c \n",
+ " g \n",
+ " t \n",
+ " + \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " t \n",
+ " g \n",
+ " c \n",
+ " t \n",
+ " a \n",
+ " t \n",
+ " c \n",
+ " c \n",
+ " t \n",
+ " g \n",
+ " ... \n",
+ " c \n",
+ " a \n",
+ " t \n",
+ " c \n",
+ " g \n",
+ " c \n",
+ " c \n",
+ " a \n",
+ " a \n",
+ " + \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " g \n",
+ " t \n",
+ " a \n",
+ " c \n",
+ " t \n",
+ " a \n",
+ " g \n",
+ " a \n",
+ " g \n",
+ " a \n",
+ " ... \n",
+ " c \n",
+ " a \n",
+ " c \n",
+ " c \n",
+ " c \n",
+ " g \n",
+ " g \n",
+ " c \n",
+ " g \n",
+ " + \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " a \n",
+ " a \n",
+ " t \n",
+ " t \n",
+ " g \n",
+ " t \n",
+ " g \n",
+ " a \n",
+ " t \n",
+ " g \n",
+ " ... \n",
+ " a \n",
+ " a \n",
+ " c \n",
+ " a \n",
+ " a \n",
+ " a \n",
+ " c \n",
+ " t \n",
+ " c \n",
+ " + \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " t \n",
+ " c \n",
+ " g \n",
+ " a \n",
+ " t \n",
+ " a \n",
+ " a \n",
+ " t \n",
+ " t \n",
+ " a \n",
+ " ... \n",
+ " c \n",
+ " c \n",
+ " g \n",
+ " t \n",
+ " g \n",
+ " g \n",
+ " t \n",
+ " a \n",
+ " g \n",
+ " + \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
5 rows × 58 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 18
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 235
+ },
+ "id": "C-J6FSJI6Pfx",
+ "outputId": "af18f038-b045-4df1-c205-12f3d8b1318f"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " 0_a 0_c 0_g 0_t 1_a 1_c 1_g 1_t 2_a 2_c ... \\\n",
+ "0 False False False True True False False False False True ... \n",
+ "1 False False False True False False True False False True ... \n",
+ "2 False False True False False False False True True False ... \n",
+ "3 True False False False True False False False False False ... \n",
+ "4 False False False True False True False False False False ... \n",
+ "\n",
+ " 55_a 55_c 55_g 55_t 56_a 56_c 56_g 56_t Class_+ Class_- \n",
+ "0 False False True False False False False True True False \n",
+ "1 True False False False True False False False True False \n",
+ "2 False True False False False False True False True False \n",
+ "3 False False False True False True False False True False \n",
+ "4 True False False False False False True False True False \n",
+ "\n",
+ "[5 rows x 230 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0_a \n",
+ " 0_c \n",
+ " 0_g \n",
+ " 0_t \n",
+ " 1_a \n",
+ " 1_c \n",
+ " 1_g \n",
+ " 1_t \n",
+ " 2_a \n",
+ " 2_c \n",
+ " ... \n",
+ " 55_a \n",
+ " 55_c \n",
+ " 55_g \n",
+ " 55_t \n",
+ " 56_a \n",
+ " 56_c \n",
+ " 56_g \n",
+ " 56_t \n",
+ " Class_+ \n",
+ " Class_- \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " ... \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " True \n",
+ " False \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " ... \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " True \n",
+ " False \n",
+ " ... \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " ... \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " ... \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
5 rows × 230 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "numerical_df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 19
+ }
+ ],
+ "source": [
+ "# One-hot encode every column (nucleotide positions and the class label)\n",
+ "numerical_df = pd.get_dummies(df)\n",
+ "numerical_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 235
+ },
+ "id": "NtOIsbv06Pfy",
+ "outputId": "3dae252b-f091-46b5-ec1c-1ab44f766c73"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " 0_a 0_c 0_g 0_t 1_a 1_c 1_g 1_t 2_a 2_c ... \\\n",
+ "0 False False False True True False False False False True ... \n",
+ "1 False False False True False False True False False True ... \n",
+ "2 False False True False False False False True True False ... \n",
+ "3 True False False False True False False False False False ... \n",
+ "4 False False False True False True False False False False ... \n",
+ "\n",
+ " 54_t 55_a 55_c 55_g 55_t 56_a 56_c 56_g 56_t Class_+ \n",
+ "0 False False False True False False False False True True \n",
+ "1 False True False False False True False False False True \n",
+ "2 False False True False False False False True False True \n",
+ "3 False False False False True False True False False True \n",
+ "4 True True False False False False False True False True \n",
+ "\n",
+ "[5 rows x 229 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0_a \n",
+ " 0_c \n",
+ " 0_g \n",
+ " 0_t \n",
+ " 1_a \n",
+ " 1_c \n",
+ " 1_g \n",
+ " 1_t \n",
+ " 2_a \n",
+ " 2_c \n",
+ " ... \n",
+ " 54_t \n",
+ " 55_a \n",
+ " 55_c \n",
+ " 55_g \n",
+ " 55_t \n",
+ " 56_a \n",
+ " 56_c \n",
+ " 56_g \n",
+ " 56_t \n",
+ " Class_+ \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " ... \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " True \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " ... \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " True \n",
+ " False \n",
+ " ... \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " True \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " ... \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " ... \n",
+ " True \n",
+ " True \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " True \n",
+ " False \n",
+ " True \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
5 rows × 229 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "numerical_df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 20
+ }
+ ],
+ "source": [
+ "# Class_- and Class_+ are redundant one-hot columns; drop one of them (Class_-)\n",
+ "numerical_df.drop('Class_-', axis = 1, inplace = True)\n",
+ "numerical_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "id": "iGyUCF226Pfz"
+ },
+ "outputs": [],
+ "source": [
+ "# rename Class_+ to Class\n",
+ "numerical_df.rename(columns = {'Class_+':'Class'}, inplace = True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "cj76tLTB6Pfz"
+ },
+ "source": [
+ "## Step 3: Training and Testing the Classification Algorithms\n",
+ "\n",
+ "Now that we have preprocessed the data, we can build our training and testing datasets and start to deploy different classification algorithms. It's relatively easy to test multiple models; as a result, we will compare and contrast the performance of ten different algorithms."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "id": "JSSgxqzG6Pf0"
+ },
+ "outputs": [],
+ "source": [
+ "#Importing different classifier from sklearn\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "from sklearn.tree import DecisionTreeClassifier\n",
+ "from sklearn import svm\n",
+ "from sklearn.naive_bayes import GaussianNB\n",
+ "from sklearn.gaussian_process.kernels import RBF\n",
+ "from sklearn.gaussian_process import GaussianProcessClassifier\n",
+ "from sklearn.neural_network import MLPClassifier\n",
+ "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n",
+ "from sklearn.metrics import classification_report, accuracy_score"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {
+ "id": "F2yJxt9S6Pf0"
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "X = numerical_df.drop(['Class'], axis = 1).values\n",
+ "y = numerical_df['Class'].values\n",
+ "\n",
+ "#define a seed for reproducibility\n",
+ "seed = 1\n",
+ "\n",
+ "# Splitting data into training and testing data\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = seed)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "2KLQMlHW6Pf0",
+ "outputId": "417ba0b4-4494-4bfb-da56-fc099e1cfa84"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "K Nearest Neighbors: 0.8107142857142857 (0.099808490089158)\n",
+ "Gaussian Process: 0.8553571428571429 (0.1606051216556957)\n",
+ "Decision Tree: 0.6821428571428572 (0.1316593470797626)\n",
+ "Random Forest: 0.5714285714285714 (0.19232731454051802)\n",
+ "Neural Net: 0.9125 (0.09762812094883318)\n",
+ "AdaBoost: 0.8625 (0.14197270864500683)\n",
+ "Naive Bayes: 0.8375 (0.1125)\n",
+ "SVM Linear: 0.9125 (0.09762812094883318)\n",
+ "SVM RBF: 0.875 (0.11180339887498948)\n",
+ "SVM Sigmoid: 0.925 (0.1)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Define scoring method\n",
+ "scoring = 'accuracy'\n",
+ "# Model building to train\n",
+ "names = ['K Nearest Neighbors', 'Gaussian Process', 'Decision Tree', 'Random Forest', 'Neural Net', 'AdaBoost', 'Naive Bayes', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid']\n",
+ "Classifiers = [\n",
+ " KNeighborsClassifier(n_neighbors=3),\n",
+ " GaussianProcessClassifier(1.0*RBF(1.0)),\n",
+ " DecisionTreeClassifier(max_depth=5),\n",
+ " RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),\n",
+ " MLPClassifier(alpha=1),\n",
+ " AdaBoostClassifier(),\n",
+ " GaussianNB(),\n",
+ " svm.SVC(kernel='linear'),\n",
+ " svm.SVC(kernel='rbf'),\n",
+ " svm.SVC(kernel='sigmoid')\n",
+ "]\n",
+ "models = zip(names, Classifiers)\n",
+ "\n",
+ "# import KFold\n",
+ "from sklearn.model_selection import KFold, cross_val_score\n",
+ "\n",
+ "names = []\n",
+ "result = []\n",
+ "for name, model in models:\n",
+ " kfold = KFold(n_splits=10, shuffle=True, random_state=1)\n",
+ " cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')\n",
+ " result.append(cv_results)\n",
+ " names.append(name)\n",
+ " msg = \"{0}: {1} ({2})\".format(name, cv_results.mean(), cv_results.std())\n",
+ " print(msg)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "misVAgvl6Pf1"
+ },
+ "source": [
+ "## Step 4 : Model Evaluation\n",
+ "\n",
+ "Now we will evaluate our classification algorithms on the test data using the accuracy score and the classification report."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "IwbJR7Nz6Pf2",
+ "outputId": "fe65867f-a939-4a32-d58d-dbfdd1437441"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "K Nearest Neighbors\n",
+ "0.7777777777777778\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " False 1.00 0.65 0.79 17\n",
+ " True 0.62 1.00 0.77 10\n",
+ "\n",
+ " accuracy 0.78 27\n",
+ " macro avg 0.81 0.82 0.78 27\n",
+ "weighted avg 0.86 0.78 0.78 27\n",
+ "\n",
+ "Gaussian Process\n",
+ "0.8888888888888888\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " False 1.00 0.82 0.90 17\n",
+ " True 0.77 1.00 0.87 10\n",
+ "\n",
+ " accuracy 0.89 27\n",
+ " macro avg 0.88 0.91 0.89 27\n",
+ "weighted avg 0.91 0.89 0.89 27\n",
+ "\n",
+ "Decision Tree\n",
+ "0.7407407407407407\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " False 1.00 0.59 0.74 17\n",
+ " True 0.59 1.00 0.74 10\n",
+ "\n",
+ " accuracy 0.74 27\n",
+ " macro avg 0.79 0.79 0.74 27\n",
+ "weighted avg 0.85 0.74 0.74 27\n",
+ "\n",
+ "Random Forest\n",
+ "0.4444444444444444\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " False 0.67 0.24 0.35 17\n",
+ " True 0.38 0.80 0.52 10\n",
+ "\n",
+ " accuracy 0.44 27\n",
+ " macro avg 0.52 0.52 0.43 27\n",
+ "weighted avg 0.56 0.44 0.41 27\n",
+ "\n",
+ "Neural Net\n",
+ "0.8888888888888888\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " False 1.00 0.82 0.90 17\n",
+ " True 0.77 1.00 0.87 10\n",
+ "\n",
+ " accuracy 0.89 27\n",
+ " macro avg 0.88 0.91 0.89 27\n",
+ "weighted avg 0.91 0.89 0.89 27\n",
+ "\n",
+ "AdaBoost\n",
+ "0.8518518518518519\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " False 1.00 0.76 0.87 17\n",
+ " True 0.71 1.00 0.83 10\n",
+ "\n",
+ " accuracy 0.85 27\n",
+ " macro avg 0.86 0.88 0.85 27\n",
+ "weighted avg 0.89 0.85 0.85 27\n",
+ "\n",
+ "Naive Bayes\n",
+ "0.9259259259259259\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " False 1.00 0.88 0.94 17\n",
+ " True 0.83 1.00 0.91 10\n",
+ "\n",
+ " accuracy 0.93 27\n",
+ " macro avg 0.92 0.94 0.92 27\n",
+ "weighted avg 0.94 0.93 0.93 27\n",
+ "\n",
+ "SVM Linear\n",
+ "0.9629629629629629\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " False 1.00 0.94 0.97 17\n",
+ " True 0.91 1.00 0.95 10\n",
+ "\n",
+ " accuracy 0.96 27\n",
+ " macro avg 0.95 0.97 0.96 27\n",
+ "weighted avg 0.97 0.96 0.96 27\n",
+ "\n",
+ "SVM RBF\n",
+ "0.9259259259259259\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " False 1.00 0.88 0.94 17\n",
+ " True 0.83 1.00 0.91 10\n",
+ "\n",
+ " accuracy 0.93 27\n",
+ " macro avg 0.92 0.94 0.92 27\n",
+ "weighted avg 0.94 0.93 0.93 27\n",
+ "\n",
+ "SVM Sigmoid\n",
+ "0.9259259259259259\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " False 1.00 0.88 0.94 17\n",
+ " True 0.83 1.00 0.91 10\n",
+ "\n",
+ " accuracy 0.93 27\n",
+ " macro avg 0.92 0.94 0.92 27\n",
+ "weighted avg 0.94 0.93 0.93 27\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "#Test the algorithm on the test data set\n",
+ "models = zip(names, Classifiers)\n",
+ "for name, model in models:\n",
+ " model.fit(X_train, y_train)\n",
+ " y_pred = model.predict(X_test)\n",
+ " print(name)\n",
+ " print(accuracy_score(y_test, y_pred))\n",
+ " print(classification_report(y_test, y_pred))\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Lv6YkrVY6Pf2"
+ },
+ "source": [
+ "## Conclusion :"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "av-mepng6Pf3"
+ },
+ "source": [
+ "#### From the above report, the Support Vector Machine with the 'linear' kernel performed best, with an average F1 score of 0.96 (accuracy 0.96) on the testing data. "
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ },
+ "colab": {
+ "provenance": []
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/DNA CLASSIFICATION/promoters.data b/DNA CLASSIFICATION/promoters.data
new file mode 100644
index 00000000..ec211eac
--- /dev/null
+++ b/DNA CLASSIFICATION/promoters.data
@@ -0,0 +1,106 @@
++,S10, tactagcaatacgcttgcgttcggtggttaagtatgtataatgcgcgggcttgtcgt
++,AMPC, tgctatcctgacagttgtcacgctgattggtgtcgttacaatctaacgcatcgccaa
++,AROH, gtactagagaactagtgcattagcttatttttttgttatcatgctaaccacccggcg
++,DEOP2, aattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaatactaacaaactc
++,LEU1_TRNA, tcgataattaactattgacgaaaagctgaaaaccactagaatgcgcctccgtggtag
++,MALEFG, aggggcaaggaggatggaaagaggttgccgtataaagaaactagagtccgtttaggt
++,MALK, cagggggtggaggatttaagccatctcctgatgacgcatagtcagcccatcatgaat
++,RECA, tttctacaaaacacttgatactgtatgagcatacagtataattgcttcaacagaaca
++,RPOB, cgacttaatatactgcgacaggacgtccgttctgtgtaaatcgcaatgaaatggttt
++,RRNAB_P1, ttttaaatttcctcttgtcaggccggaataactccctataatgcgccaccactgaca
++,RRNAB_P2, gcaaaaataaatgcttgactctgtagcgggaaggcgtattatgcacaccccgcgccg
++,RRNDEX_P2, cctgaaattcagggttgactctgaaagaggaaagcgtaatatacgccacctcgcgac
++,RRND_P1, gatcaaaaaaatacttgtgcaaaaaattgggatccctataatgcgcctccgttgaga
++,RRNE_P1, ctgcaatttttctattgcggcctgcggagaactccctataatgcgcctccatcgaca
++,RRNG_P1, tttatatttttcgcttgtcaggccggaataactccctataatgcgccaccactgaca
++,RRNG_P2, aagcaaagaaatgcttgactctgtagcgggaaggcgtattatgcacaccgccgcgcc
++,RRNX_P1, atgcatttttccgcttgtcttcctgagccgactccctataatgcgcctccatcgaca
++,TNAA, aaacaatttcagaatagacaaaaactctgagtgtaataatgtagcctcgtgtcttgc
++,TYRT, tctcaacgtaacactttacagcggcgcgtcatttgatatgatgcgccccgcttcccg
++,ARAC, gcaaataatcaatgtggacttttctgccgtgattatagacacttttgttacgcgttt
++,LACI, gacaccatcgaatggcgcaaaacctttcgcggtatggcatgatagcgcccggaagag
++,MALT, aaaaacgtcatcgcttgcattagaaaggtttctggccgaccttataaccattaatta
++,TRP, tctgaaatgagctgttgacaattaatcatcgaactagttaactagtacgcaagttca
++,TRPP2, accggaagaaaaccgtgacattttaacacgtttgttacaaggtaaaggcgacgccgc
++,THR, aaattaaaattttattgacttaggtcactaaatactttaaccaatataggcatagcg
++,BIOB, ttgtcataatcgacttgtaaaccaaattgaaaagatttaggtttacaagtctacacc
++,FOL, catcctcgcaccagtcgacgacggtttacgctttacgtatagtggcgacaatttttt
++,UVRBP1, tccagtataatttgttggcataattaagtacgacgagtaaaattacatacctgcccg
++,UVRBP3, acagttatccactattcctgtggataaccatgtgtattagagttagaaaacacgagg
++,LEXA, tgtgcagtttatggttccaaaatcgccttttgctgtatatactcacagcataactgt
++,PORI-L, ctgttgttcagtttttgagttgtgtataacccctcattctgatcccagcttatacgg
++,SPOT42, attacaaaaagtgctttctgaactgaacaaaaaagagtaaagttagtcgcgtagggt
++,M1RNA, atgcgcaacgcggggtgacaagggcgcgcaaaccctctatactgcgcgccgaagctg
++,GLNS, taaaaaactaacagttgtcagcctgtcccgcttataagatcatacgccgttatacgt
++,TUFB, atgcaattttttagttgcatgaactcgcatgtctccatagaatgcgcgctacttgat
++,SUBB-E, ccttgaaaaagaggttgacgctgcaaggctctatacgcataatgcgccccgcaacgc
++,STR, tcgttgtatatttcttgacaccttttcggcatcgccctaaaattcggcgtcctcata
++,SPC, ccgtttattttttctacccatatccttgaagcggtgttataatgccgcgccctcgat
++,RPOA, ttcgcatatttttcttgcaaagttgggttgagctggctagattagccagccaatctt
++,RPLJ, tgtaaactaatgcctttacgtgggcggtgattttgtctacaatcttacccccacgta
++,PORI-R, gatcgcacgatctgtatacttatttgagtaaattaacccacgatcccagccattctt
++,ALAS, aacgcatacggtattttaccttcccagtcaagaaaacttatcttattcccacttttc
++,ARABAD, ttagcggatcctacctgacgctttttatcgcaactctctactgtttctccatacccg
++,BIOA, gccttctccaaaacgtgttttttgttgttaattcggtgtagacttgtaaacctaaat
++,DEOP1, cagaaacgttttattcgaacatcgatctcgtcttgtgttagaattctaacatacggt
++,GALP2, cactaatttattccatgtcacacttttcgcatctttgttatgctatggttatttcat
++,HIS, atataaaaaagttcttgctttctaacgtgaaagtggtttaggttaaaagacatcagt
++,HISJ, caaggtagaatgctttgccttgtcggcctgattaatggcacgatagtcgcatcggat
++,ILVGEDA, ggccaaaaaatatcttgtactatttacaaaacctatggtaactctttaggcattcct
++,LACP1, taggcaccccaggctttacactttatgcttccggctcgtatgttgtgtggaattgtg
++,LPP, ccatcaaaaaaatattctcaacataaaaaactttgtgtaatacttgtaacgctacat
++,TRPR, tggggacgtcgttactgatccgcacgtttatgatatgctatcgtactctttagcgag
++,UVRB_P2, tcagaaatattatggtgatgaactgtttttttatccagtataatttgttggcataat
+-, 867, atatgaacgttgagactgccgctgagttatcagctgtgaacgacattctggcgtcta
+-,1169, cgaacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctc
+-, 802, caatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatatgcg
+-, 521, ttgacctactacgccagcattttggcggtgtaagctaaccattccggttgactcaat
+-, 918, cgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatgcag
+-,1481, gccaatcaatcaagaacttgaagggtggtatcagccaacagcctgacatccttcgtt
+-,1024, tggatggacgttcaacattgaggaaggcataacgctactacctgatgtttactccaa
+-,1149, gaggtggctatgtgtatgaccgaacgagtcaatcagaccgctttgactctggtatta
+-, 313, cgtagcgcatcagtgctttcttactgtgagtacgcaccagcgccagaggacgacgac
+-, 780, cgaccgaagcgagcctcgtcctcaatggcctctaaacgggtcttgaggggttttttg
+-,1384, ctacggtgggtacaatatgctggatggagatgcgttcacttctggtctactgactcg
+-, 507, atagtctcagagtcttgacctactacgccagcattttggcggtgtaagctaaccatt
+-, 39, aactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagcagcg
+-,1203, ttactgtgaacattattcgtctccgcgactacgatgagatgcctgagtgcttccgtt
+-, 988, tattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattga
+-,1171, aacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctccg
+-, 753, aagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatggcc
+-, 630, gaagaccacgcctcgccaccgagtagacccttagagagcatgtcagcctcgacaact
+-, 660, ttagagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccct
+-,1216, tattcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcac
+-, 835, tgctgaaaggaggaactatatgcgctcatacgatatgaacgttgagactgccgctga
+-, 35, catgaactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagc
+-,1218, ttcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcacca
+-, 668, catgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacgcgctt
+-, 413, aggaggaactacgcaaggttggaacatcggagagatgccagccagcgcacctgcacg
+-, 991, tctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattgagga
+-, 751, tgaagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatgg
+-, 850, ctatatgcgctcatacgatatgaacgttgagactgccgctgagttatcagctgtgaa
+-, 93, gcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccggt
+-,1108, atccctaatgtctacttccggtcaatccatctacgttaaccgaggtggctatgtgta
+-, 915, tggcgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatg
+-,1019, tctcgtggatggacgttcaacattgaggaaggcataacgctactacctgatgtttac
+-, 19, tattggcttgctcaagcatgaactcaaggctgatacggcgagacttgcgagccttgt
+-,1320, tagagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatg
+-, 91, cagcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccg
+-, 217, ttacgttggcgaccgctaggactttcttgttgattttccatgcggtgttttgcgcaa
+-, 957, acgctaacgcagatgcagcgaacgctcggcgtattctcaacaagattaaccgacaga
+-, 260, ggtgttttgcgcaatgttaatcgctttgtacacctcaggcatgtaaacgtcttcgta
+-, 557, aaccattccggttgactcaatgagcatctcgatgcagcgtactcctacatgaataga
+-,1355, agacgtctctgcatggagtatgagatggactacggtgggtacaatatgctggatgga
+-, 244, tgttgattttccatgcggtgttttgcgcaatgttaatcgctttgtacacctcaggca
+-, 464, tgcacgggttgcgatagcctcagcgtattcaggtgcgagttcgatagtctcagagtc
+-, 296, aggcatgtaaacgtcttcgtagcgcatcagtgctttcttactgtgagtacgcaccag
+-, 648, ccgagtagacccttagagagcatgtcagcctcgacaacttgcataaatgctttcttg
+-, 230, cgctaggactttcttgttgattttccatgcggtgttttgcgcaatgttaatcgcttt
+-,1163, tatgaccgaacgagtcaatcagaccgctttgactctggtattactgtgaacattatt
+-,1321, agagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatga
+-, 663, gagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacg
+-, 799, cctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatat
+-, 987, gtattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattg
+-,1226, cgcgactacgatgagatgcctgagtgcttccgttactggattgtcaccaaggcttcc
+-, 794, ctcgtcctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaac
+-,1442, taacattaataaataaggaggctctaatggcactcattagccaatcaatcaagaact