Initial commit

galaxyproteomics · Jul 21, 2020 · 6470d4a · 6470d4a
commit 6470d4a
Show file tree

Hide file tree

Showing 14 changed files with 1,199 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,20 @@
+dist/
+build/
+fastg2protlib.egg-info/
+
+venv/
+.vscode/
+.pytest_cache/
+fastg2protlib/__pycache__/
+tests/__pycache__/
+.idea/
+*.db
+*.csv
+*.fasta
+*.mzid
+notes.txt
+run_*
+*.tabular
+
+.DS_Store
+application.py
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -0,0 +1,17 @@
+MIT License
+Copyright (c) 2018 YOUR NAME
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,34 @@
+# FASTG to Protein Library
+
+
+This package generates a candidate protein library in two phases:
+
+1) Parsing a FASTG file to create graph traversals of longer stretches of DNA
+
+    - FASTG is parsed into a directed graph. A depth-first search is made on all connecting
+            edges. The DFS traversal is then used to concatenate all DNA sequences in the path.
+    - DNA sequences are translated to mRNA and split into candidate proteins at the stop 
+            codon. Each DFS traversal can, and will, produce a set of candidate protein sequences.
+    - Protein sequences are filtered on length and amino acid redundancy. 
+    - Protein sequences are cleaved into peptide sequences.
+    - DFS traversals, proteins and peptides are stored in a SQLite database. The linking 
+            relationship between all three is maintained in the DB.
+    - A FASTA file of peptides is produced for the user. This FASTA file is to be used
+            in a search against MSMS data.
+
+2) Using verified peptides as a filter to produce a final candidate peptide library
+
+    - The user will invoke the code with 
+        - DB
+        - list of peptide sequences or peptide FASTA
+        - It is expected that the submitted peptides have been verified against MSMS and
+            they represent found and identified peptide sequences
+    - The verified peptides are used to filter proteins from the database, these
+            proteins become the final library.
+    - The verified peptides are used to score the proteins for 
+        - coverage
+        - percent of verified v. total peptide association
+    - Final user output
+        - SQLite database
+        - Protein score text file, comma delimited
+        - Filtered protein FASTA file
diff --git a/fastg2protlib/__init__.py b/fastg2protlib/__init__.py
diff --git a/fastg2protlib/db.py b/fastg2protlib/db.py
@@ -0,0 +1,211 @@
+import logging
+import os
+import sqlite3
+
+
+class DBManager:
+    def __init__(self, name="app.db"):
+        self.database_name = name
+        self.conn = sqlite3.connect(self.database_name)
+        self.cursor = self.conn.cursor()
+        self.logger = logging.getLogger(__name__)
+        self.logger.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))
+        handler = logging.StreamHandler()
+        formatter = logging.Formatter(
+            "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
+        )
+        handler.setFormatter(formatter)
+        self.logger.addHandler(handler)
+        # Build the tables
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS walk
+                    (id INTEGER PRIMARY KEY, walk TEXT, dna_sequence TEXT)
+                """
+        )
+
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS protein 
+                    (id INTEGER PRIMARY KEY, sequence TEXT)"""
+        )
+
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS peptide
+                    (id INTEGER PRIMARY KEY, sequence TEXT)"""
+        )
+
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS protein_to_walk
+                    (walk_id INTEGER, protein_id INTEGER)"""
+        )
+
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS peptide_to_protein
+                    (peptide_id INTEGER, protein_id INTEGER)"""
+        )
+
+        self.conn.commit()
+
+    def exec_query(self, query):
+        return self.cursor.execute(query).fetchall()
+
+    def d_quote(self, str):
+        return '"' + str + '"'
+
+    def link_proteins_peptides(self, prot_peps):
+        """
+        Populate many-to-many table linking peptides to proteins
+        prot_pep object:
+        {
+            "prot_id": int,
+            "peptides": set()
+        }
+        """
+        self.logger.info("Begin link_proteins_peptides")
+        sql = sqlite3.connect(self.database_name)
+        sql.isolation_level = None
+        c = sql.cursor()
+        c.execute("begin")
+        try:
+            for obj in prot_peps:
+                protein_id = obj["prot_id"]
+                pep_seqs = []
+                for pep_seq in obj["peptides"]:
+                    pep_seqs.append(pep_seq)
+                in_clause = "(" + ",".join(list(map(self.d_quote, pep_seqs))) + ")"
+                sql_q = f"SELECT id FROM peptide WHERE sequence IN {in_clause}"
+                pep_ids = c.execute(sql_q).fetchall()
+                ins_data = []
+                for pep_id in pep_ids:
+                    ins_data.append((pep_id[0], protein_id))
+                sql_ins = f"INSERT INTO peptide_to_protein(peptide_id, protein_id) VALUES (?, ?)"
+                c.executemany(sql_ins, ins_data)
+            c.execute("commit")
+        except sql.Error:
+            self.logger.error(
+                "Transaction failed, rolling back in link_proteins_peptides"
+            )
+            c.execute("rollback")
+        self.logger.info("End link_proteins_peptides")
+
+    def insert_peptides(self, prot_peps):
+        """{
+            "prot_id": int,
+            "peptides": set()
+        }
+        """
+        self.logger.info(f"Begin Insert peptides")
+        sql = sqlite3.connect(self.database_name)
+        sql.isolation_level = None
+        c = sql.cursor()
+        c.execute("begin")
+
+        def data_iter(data):
+            """Filter out duplicate sequences
+            """
+            seen = set()
+            for item in data:
+                if item in seen:
+                    pass
+                else:
+                    seen.add(item)
+                    yield item
+
+        try:
+            pep_seqs = []
+            for pp in prot_peps:
+                for p in pp["peptides"]:
+                    pep_seqs.append((p,))
+            c.executemany(
+                "INSERT INTO peptide(sequence) VALUES (?)", data_iter(pep_seqs)
+            )
+            c.execute("commit")
+        except sql.Error:
+            self.logger.error("Transaction failed, rolling back in insert_peptides")
+            c.execute("rollback")
+        # Add index
+        self.logger.info("Creating peptide_sequence index")
+        c.execute("CREATE INDEX peptide_sequence ON peptide(sequence)")
+        self.link_proteins_peptides(prot_peps)
+        self.logger.info(f"End Insert peptides")
+
+    def build_links(self, objects):
+        """
+            Populate many-to-many for proteins to walk traversals
+        """
+        self.logger.info(f"Begin build_links")
+        sql = sqlite3.connect(self.database_name)
+        sql.isolation_level = None
+        c = sql.cursor()
+        c.execute("begin")
+        try:
+            for object in objects:
+                walk = ",".join(object["walk"])
+                w_s = f'select id from walk where walk = "{walk}"'
+                w_id = c.execute(w_s).fetchone()
+                prot_seqs = []
+                for p in object["proteins"]:
+                    prot_seqs.append(p)
+                in_clause = "(" + ",".join(list(map(self.d_quote, prot_seqs))) + ")"
+                sql_q = f"SELECT id FROM protein WHERE sequence IN {in_clause}"
+                prot_ids = c.execute(sql_q).fetchall()
+                ins_data = []
+                for prot_id in prot_ids:
+                    ins_data.append((w_id[0], prot_id[0]))
+                sql_ins = (
+                    "INSERT INTO protein_to_walk(walk_id, protein_id) VALUES (?,?)"
+                )
+                c.executemany(sql_ins, ins_data)
+            c.execute("commit")
+        except sql.Error:
+            self.logger.error("Transaction failed, rolling back in build_links")
+            c.execute("rollback")
+        self.logger.info(f"End build_links")
+
+    def insert_protein_walk_objects(self, objects):
+        self.logger.info(f"Begin insert_protein_walk_objects")
+        sql = sqlite3.connect(self.database_name)
+        sql.isolation_level = None
+        c = sql.cursor()
+        c.execute("begin")
+
+        def protein_sequence_iter(objects):
+            for object in objects:
+                for p in object["proteins"]:
+                    yield (p,)
+
+        def walk_sequence_iter(objects):
+            for object in objects:
+                walk = ",".join(object["walk"])
+                seq = object["dna_sequence"]
+                yield (walk, seq)
+
+        try:
+            c.executemany(
+                "INSERT INTO protein(sequence) VALUES (?)",
+                protein_sequence_iter(objects),
+            )
+            c.executemany(
+                "INSERT INTO walk (walk, dna_sequence) VALUES (?,?)",
+                walk_sequence_iter(objects),
+            )
+            c.execute("commit")
+        except sql.Error:
+            self.logger.error(
+                "Transaction failed, rolling back in insert_protein_walk_objects"
+            )
+            c.execute("rollback")
+        # Create indexes
+        self.logger.info("Creating indexes: walks_walk, walks_w_d and protein_sequence")
+        c.execute("CREATE INDEX walks_walk ON walk(walk)")
+        c.execute("CREATE INDEX protein_sequence ON protein(sequence)")
+        c.execute("CREATE INDEX walks_w_d ON walk(walk, dna_sequence)")
+
+        self.build_links(objects)
+        self.logger.info(f"End insert_protein_walk_objects")
+
+    def get_protein_sequences(self):
+        self.logger.info(f"Begin get_protein_sequences")
+        c = self.conn.execute("select id, sequence from protein")
+        r_val = [x for x in c]
+        self.logger.info(f"End get_protein_sequences")
+        return r_val
diff --git a/fastg2protlib/diagnostics.py b/fastg2protlib/diagnostics.py
@@ -0,0 +1,70 @@
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from Bio.SeqUtils import GC
+
+
+def _sequence_histogram(seq_lengths, title, file_name):
+    plt.hist(seq_lengths, 20, facecolor="blue", alpha=0.5)
+    plt.xlabel("Length")
+    plt.ylabel("Count")
+    plt.title(title)
+    plt.savefig(file_name)
+    plt.clf()
+
+
+def _gc_plot(sequences, title, file_name):
+    gc_values = sorted(GC(seq) for seq in sequences)
+    plt.plot(gc_values)
+    plt.title(title)
+    plt.xlabel("ORFs")
+    plt.ylabel("GC%")
+    plt.savefig(file_name)
+    plt.clf()
+
+
+def _aa_barchart(prot_sequences):
+    aa_count = defaultdict(int)
+    for sequence in prot_sequences:
+        for aa in sequence:
+            aa_count[aa] += 1
+
+    objects = [*aa_count]
+    objects.sort()
+    y_pos = np.arange(len(objects))
+    performance = []
+    for aa in objects:
+        performance.append(aa_count[aa])
+
+    plt.bar(y_pos, performance, align="center", facecolor="blue", alpha=0.5)
+    plt.xticks(y_pos, objects)
+    plt.ylabel("Count")
+    plt.title("Amino Acid Count for Translated Proteins")
+    plt.savefig("aa_count_chart.png")
+    plt.clf()
+
+
+def generate_diagnostic_plots(db_manager):
+    dna_results = db_manager.exec_query("select dna_sequence from walk")
+
+    dna_lengths = map(lambda x: len(x[0]), dna_results)
+    _sequence_histogram(
+        list(dna_lengths), "DNA Sequence Lengths from FASTG", "fastg_seq_lengths.png"
+    )
+
+    protein_results = db_manager.exec_query("select sequence from protein")
+    protein_lengths = map(lambda x: len(x[0]), protein_results)
+    _sequence_histogram(
+        list(protein_lengths), "Protein Sequence Lengths", "protein_seq_lengths.png"
+    )
+
+    _gc_plot(
+        list(map(lambda x: x[0], dna_results)),
+        "GC Pct. - Translated FASTG",
+        "gc_pct.png",
+    )
+
+    # bar chart of all translated protein amino acids
+    _aa_barchart(list(map(lambda x: x[0], protein_results)))