From 6470d4a77023da7ab6fd12d0de1045477d977fa3 Mon Sep 17 00:00:00 2001
From: tmcgowan
Date: Tue, 21 Jul 2020 14:30:14 -0500
Subject: [PATCH] Initial commit

---
 .gitignore                        |  20 ++
 LICENSE.txt                       |  17 ++
 README.md                         |  34 +++
 fastg2protlib/__init__.py         |   0
 fastg2protlib/db.py               | 211 +++++++++++++++
 fastg2protlib/diagnostics.py      |  70 +++++
 fastg2protlib/fastg2protlib.py    | 423 ++++++++++++++++++++++++++++++
 fastg2protlib/mzid_sax_parsing.py |  63 +++++
 fastg2protlib/protein_score.py    | 248 ++++++++++++++++++
 requirements.txt                  |  27 ++
 setup.py                          |  28 ++
 tests/__init__.py                 |   0
 tests/data/two.fastg              |  40 +++
 tests/test_basic.py               |  18 ++
 14 files changed, 1199 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 LICENSE.txt
 create mode 100644 README.md
 create mode 100644 fastg2protlib/__init__.py
 create mode 100644 fastg2protlib/db.py
 create mode 100644 fastg2protlib/diagnostics.py
 create mode 100644 fastg2protlib/fastg2protlib.py
 create mode 100644 fastg2protlib/mzid_sax_parsing.py
 create mode 100644 fastg2protlib/protein_score.py
 create mode 100644 requirements.txt
 create mode 100644 setup.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/data/two.fastg
 create mode 100644 tests/test_basic.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3a7d4f2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,20 @@
+dist/
+build/
+fastg2protlib.egg-info/
+
+venv/
+.vscode/
+.pytest_cache/
+fastg2protlib/__pycache__/
+tests/__pycache__/
+.idea/
+*.db
+*.csv
+*.fasta
+*.mzid
+notes.txt
+run_*
+*.tabular
+
+.DS_Store
+application.py
\ No newline at end of file
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..3754756
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,17 @@
+MIT License
+Copyright (c) 2018 YOUR NAME
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b7a5558
--- /dev/null
+++ b/README.md
@@ -0,0 +1,34 @@
+# FASTG to Protein Library
+
+
+This package generates a candidate protein library in two phases:
+
+1) Parsing a FASTG file to create graph traversals of longer stretches of DNA
+
+   - FASTG is parsed into a directed graph. A depth-first search is made on all connecting
+     edges. The DFS traversal is then used to concatenate all DNA sequences in the path.
+   - DNA sequences are translated in all six reading frames and split into candidate
+     proteins at stop codons. Each DFS traversal can produce a set of candidate protein sequences.
+   - Protein sequences are filtered on length and amino acid redundancy.
+   - Protein sequences are cleaved into peptide sequences.
+   - DFS traversals, proteins and peptides are stored in a SQLite database. The linking
+     relationships among all three are maintained in the DB.
+   - A FASTA file of peptides is produced for the user. This FASTA file is to be used
+     in a search against MSMS data.
+
+2) Using verified peptides as a filter to produce a final candidate protein library
+
+   - The user will invoke the code with
+     - the DB
+     - a list of peptide sequences or a peptide FASTA file
+   - The submitted peptides are expected to have been verified against MSMS data,
+     representing found and identified peptide sequences.
+   - The verified peptides are used to filter proteins from the database; these
+     proteins become the final library.
+   - The verified peptides are used to score the proteins for
+     - coverage
+     - percent of verified vs. total peptide associations
+   - Final user output
+     - SQLite database
+     - Protein score text file, comma delimited
+     - Filtered protein FASTA file
\ No newline at end of file
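The two phases in the README map onto two entry points defined later in this patch, peptides_for_fastg and verified_proteins. A minimal sketch of a full round trip (file names here are illustrative, and the MSMS search itself happens outside this package, between the two calls):

    from fastg2protlib.fastg2protlib import peptides_for_fastg, verified_proteins

    # Phase 1: FASTG -> SQLite DB plus a peptide FASTA for the search engine
    peptides_for_fastg(
        "assembly_graph.fastg",      # illustrative input path
        cleavage="trypsin",
        min_protein_length=166,
        min_peptide_length=10,
        db_name="candidates.db",     # illustrative DB path
        peptide_file_name="peptide.fasta",
    )

    # ... search peptide.fasta against MSMS data with MSGF+, export tabular PSMs ...

    # Phase 2: verified PSMs -> scored, filtered protein library
    verified_proteins(
        "msgf_psms.tabular",         # illustrative MSGF+ tabular output
        fdr_level=0.10,
        decoy_header="XXX_",
        db_name="candidates.db",
    )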
diff --git a/fastg2protlib/__init__.py b/fastg2protlib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/fastg2protlib/db.py b/fastg2protlib/db.py
new file mode 100644
index 0000000..f1177b2
--- /dev/null
+++ b/fastg2protlib/db.py
@@ -0,0 +1,211 @@
+import logging
+import os
+import sqlite3
+
+
+class DBManager:
+    def __init__(self, name="app.db"):
+        self.database_name = name
+        self.conn = sqlite3.connect(self.database_name)
+        self.cursor = self.conn.cursor()
+        self.logger = logging.getLogger(__name__)
+        self.logger.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))
+        handler = logging.StreamHandler()
+        formatter = logging.Formatter(
+            "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
+        )
+        handler.setFormatter(formatter)
+        self.logger.addHandler(handler)
+        # Build the tables
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS walk
+            (id INTEGER PRIMARY KEY, walk TEXT, dna_sequence TEXT)
+            """
+        )
+
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS protein
+            (id INTEGER PRIMARY KEY, sequence TEXT)"""
+        )
+
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS peptide
+            (id INTEGER PRIMARY KEY, sequence TEXT)"""
+        )
+
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS protein_to_walk
+            (walk_id INTEGER, protein_id INTEGER)"""
+        )
+
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS peptide_to_protein
+            (peptide_id INTEGER, protein_id INTEGER)"""
+        )
+
+        self.conn.commit()
+
+    def exec_query(self, query):
+        return self.cursor.execute(query).fetchall()
+
+    def d_quote(self, s):
+        return '"' + s + '"'
+
+    def link_proteins_peptides(self, prot_peps):
+        """
+        Populate many-to-many table linking peptides to proteins
+
+        prot_pep object:
+            {
+                "prot_id": int,
+                "peptides": set()
+            }
+        """
+        self.logger.info("Begin link_proteins_peptides")
+        sql = sqlite3.connect(self.database_name)
+        sql.isolation_level = None
+        c = sql.cursor()
+        c.execute("begin")
+        try:
+            for obj in prot_peps:
+                protein_id = obj["prot_id"]
+                pep_seqs = []
+                for pep_seq in obj["peptides"]:
+                    pep_seqs.append(pep_seq)
+                in_clause = "(" + ",".join(list(map(self.d_quote, pep_seqs))) + ")"
+                sql_q = f"SELECT id FROM peptide WHERE sequence IN {in_clause}"
+                pep_ids = c.execute(sql_q).fetchall()
+                ins_data = []
+                for pep_id in pep_ids:
+                    ins_data.append((pep_id[0], protein_id))
+                sql_ins = "INSERT INTO peptide_to_protein(peptide_id, protein_id) VALUES (?, ?)"
+                c.executemany(sql_ins, ins_data)
+
+            c.execute("commit")
+        except sqlite3.Error:
+            self.logger.error(
+                "Transaction failed, rolling back in link_proteins_peptides"
+            )
+            c.execute("rollback")
+        self.logger.info("End link_proteins_peptides")
c.execute("commit") + except sql.Error: + self.logger.error( + "Transaction failed, rolling back in link_proteins_peptides" + ) + c.execute("rollback") + self.logger.info("End link_proteins_peptides") + + def insert_peptides(self, prot_peps): + """{ + "prot_id": int, + "peptides": set() + } + """ + self.logger.info(f"Begin Insert peptides") + sql = sqlite3.connect(self.database_name) + sql.isolation_level = None + c = sql.cursor() + c.execute("begin") + + def data_iter(data): + """Filter out duplicate sequences + """ + seen = set() + for item in data: + if item in seen: + pass + else: + seen.add(item) + yield item + + try: + pep_seqs = [] + for pp in prot_peps: + for p in pp["peptides"]: + pep_seqs.append((p,)) + c.executemany( + "INSERT INTO peptide(sequence) VALUES (?)", data_iter(pep_seqs) + ) + c.execute("commit") + except sql.Error: + self.logger.error("Transaction failed, rolling back in insert_peptides") + c.execute("rollback") + # Add index + self.logger.info("Creating peptide_sequence index") + c.execute("CREATE INDEX peptide_sequence ON peptide(sequence)") + self.link_proteins_peptides(prot_peps) + self.logger.info(f"End Insert peptides") + + def build_links(self, objects): + """ + Populate many-to-many for proteins to walk traversals + """ + self.logger.info(f"Begin build_links") + sql = sqlite3.connect(self.database_name) + sql.isolation_level = None + c = sql.cursor() + c.execute("begin") + try: + for object in objects: + walk = ",".join(object["walk"]) + w_s = f'select id from walk where walk = "{walk}"' + w_id = c.execute(w_s).fetchone() + prot_seqs = [] + for p in object["proteins"]: + prot_seqs.append(p) + in_clause = "(" + ",".join(list(map(self.d_quote, prot_seqs))) + ")" + sql_q = f"SELECT id FROM protein WHERE sequence IN {in_clause}" + prot_ids = c.execute(sql_q).fetchall() + ins_data = [] + for prot_id in prot_ids: + ins_data.append((w_id[0], prot_id[0])) + sql_ins = ( + "INSERT INTO protein_to_walk(walk_id, protein_id) VALUES (?,?)" + ) + c.executemany(sql_ins, ins_data) + c.execute("commit") + except sql.Error: + self.logger.error("Transaction failed, rolling back in build_links") + c.execute("rollback") + self.logger.info(f"End build_links") + + def insert_protein_walk_objects(self, objects): + self.logger.info(f"Begin insert_protein_walk_objects") + sql = sqlite3.connect(self.database_name) + sql.isolation_level = None + c = sql.cursor() + c.execute("begin") + + def protein_sequence_iter(objects): + for object in objects: + for p in object["proteins"]: + yield (p,) + + def walk_sequence_iter(objects): + for object in objects: + walk = ",".join(object["walk"]) + seq = object["dna_sequence"] + yield (walk, seq) + + try: + c.executemany( + "INSERT INTO protein(sequence) VALUES (?)", + protein_sequence_iter(objects), + ) + c.executemany( + "INSERT INTO walk (walk, dna_sequence) VALUES (?,?)", + walk_sequence_iter(objects), + ) + c.execute("commit") + except sql.Error: + self.logger.error( + "Transaction failed, rolling back in insert_protein_walk_objects" + ) + c.execute("rollback") + # Create indexes + self.logger.info("Creating indexes: walks_walk, walks_w_d and protein_sequence") + c.execute("CREATE INDEX walks_walk ON walk(walk)") + c.execute("CREATE INDEX protein_sequence ON protein(sequence)") + c.execute("CREATE INDEX walks_w_d ON walk(walk, dna_sequence)") + + self.build_links(objects) + self.logger.info(f"End insert_protein_walk_objects") + + def get_protein_sequences(self): + self.logger.info(f"Begin get_protein_sequences") + c = self.conn.execute("select 
diff --git a/fastg2protlib/diagnostics.py b/fastg2protlib/diagnostics.py
new file mode 100644
index 0000000..d42605d
--- /dev/null
+++ b/fastg2protlib/diagnostics.py
@@ -0,0 +1,70 @@
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from Bio.SeqUtils import GC
+
+
+def _sequence_histogram(seq_lengths, title, file_name):
+    plt.hist(seq_lengths, 20, facecolor="blue", alpha=0.5)
+    plt.xlabel("Length")
+    plt.ylabel("Count")
+    plt.title(title)
+    plt.savefig(file_name)
+    plt.clf()
+
+
+def _gc_plot(sequences, title, file_name):
+    gc_values = sorted(GC(seq) for seq in sequences)
+    plt.plot(gc_values)
+    plt.title(title)
+    plt.xlabel("ORFs")
+    plt.ylabel("GC%")
+    plt.savefig(file_name)
+    plt.clf()
+
+
+def _aa_barchart(prot_sequences):
+    aa_count = defaultdict(int)
+    for sequence in prot_sequences:
+        for aa in sequence:
+            aa_count[aa] += 1
+
+    objects = [*aa_count]
+    objects.sort()
+    y_pos = np.arange(len(objects))
+    counts = []
+    for aa in objects:
+        counts.append(aa_count[aa])
+
+    plt.bar(y_pos, counts, align="center", facecolor="blue", alpha=0.5)
+    plt.xticks(y_pos, objects)
+    plt.ylabel("Count")
+    plt.title("Amino Acid Count for Translated Proteins")
+    plt.savefig("aa_count_chart.png")
+    plt.clf()
+
+
+def generate_diagnostic_plots(db_manager):
+    dna_results = db_manager.exec_query("select dna_sequence from walk")
+
+    dna_lengths = map(lambda x: len(x[0]), dna_results)
+    _sequence_histogram(
+        list(dna_lengths), "DNA Sequence Lengths from FASTG", "fastg_seq_lengths.png"
+    )
+
+    protein_results = db_manager.exec_query("select sequence from protein")
+    protein_lengths = map(lambda x: len(x[0]), protein_results)
+    _sequence_histogram(
+        list(protein_lengths), "Protein Sequence Lengths", "protein_seq_lengths.png"
+    )
+
+    _gc_plot(
+        list(map(lambda x: x[0], dna_results)),
+        "GC Pct. - Translated FASTG",
+        "gc_pct.png",
+    )
+
+    # bar chart of all translated protein amino acids
+    _aa_barchart(list(map(lambda x: x[0], protein_results)))
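diagnostics.py drives all four plots from plain SQL against the run database, so they can be regenerated for any existing database without re-running the pipeline. A minimal sketch (the database path is illustrative; the PNGs are written to the current working directory):

    from fastg2protlib.db import DBManager
    from fastg2protlib.diagnostics import generate_diagnostic_plots

    # Re-open a database produced by peptides_for_fastg() and write
    # fastg_seq_lengths.png, protein_seq_lengths.png, gc_pct.png and
    # aa_count_chart.png.
    db = DBManager("candidates.db")  # illustrative path
    generate_diagnostic_plots(db)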
diff --git a/fastg2protlib/fastg2protlib.py b/fastg2protlib/fastg2protlib.py
new file mode 100644
index 0000000..3434172
--- /dev/null
+++ b/fastg2protlib/fastg2protlib.py
@@ -0,0 +1,423 @@
+from Bio.Seq import Seq
+from collections import defaultdict
+import datetime
+import logging
+import networkx as nx
+import os
+from pyteomics import parser
+import re
+import sys
+
+from fastg2protlib.diagnostics import generate_diagnostic_plots
+from fastg2protlib.db import DBManager
+from fastg2protlib.protein_score import score_proteins, score_tabular
+
+logger = logging.getLogger(__name__)
+logger.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))
+handler = logging.StreamHandler()
+formatter = logging.Formatter("%(asctime)s %(name)-12s %(levelname)-8s %(message)s")
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+
+def _parse_edges(edge_name, edge_str):
+    """
+    EDGE_71_length_1904_cov_38.3193,EDGE_72_length_57_cov_29.75
+
+    becomes [(node_name, '71+'), ('71+', '72+')]
+
+    :param edge_name:
+    :param edge_str:
+    :return: list of tuples
+    """
+    r_val = []
+
+    all_edges = edge_str.split(",")
+    leading_edge = edge_name
+    for e in all_edges:
+        rc = False
+        if e.endswith("'"):
+            rc = True
+        regex = re.compile(r"(?:EDGE|NODE)_(?P<node_name>[a-zA-Z\d]+?)_length_")
+        m = regex.search(e)
+        node_name = m.group("node_name")
+        if rc:
+            node_name += "-"
+        else:
+            node_name += "+"
+        r_val.append((leading_edge, node_name))
+        leading_edge = node_name
+    return r_val
+
+
+def _parse_fastg_header(header):
+    # region
+    """
+    Parse FASTG header and return
+        * node_name: '1+'
+        * length: 449
+        * coverage: 82.5991
+        * edges: [('1+', '71+'), ('71+', '72+')]
+        * sequence: None
+        * warnings: [list of warning messages, if any]
+
+    >EDGE_1_length_449_cov_82.5991:EDGE_71_length_1904_cov_38.3193,EDGE_72_length_57_cov_29.75;
+
+    :param header:
+    :return:
+    """
+    # endregion
+    keys = ["node_name", "length", "coverage", "edges", "sequence", "warnings"]
+    obj = dict.fromkeys(keys)
+    # Warnings must start as a list so that later appends are safe
+    obj["warnings"] = []
+
+    header = header.lstrip(">").rstrip(";")
+    num_colons = sum([1 for x in header if x == ":"])
+    if num_colons > 1:
+        obj["warnings"].append("Too many colons in header, not in FASTG file format")
+
+    rev_compl = False
+    chunks = header.split(":")
+
+    if chunks[0].endswith("'"):
+        rev_compl = True
+    regex = re.compile(
+        r"(?:EDGE|NODE)_(?P<node>[a-zA-Z\d]+?)_length_(?P<length>\d+?)_cov_(?P<cov>[\d|\.]+)"
+    )
+    m = regex.search(chunks[0])
+    if m is None:
+        obj["warnings"].append("Regex parsing failed")
+        return obj
+    node_name = m.group("node")
+    if rev_compl:
+        node_name += "-"
+    else:
+        node_name += "+"
+    obj["node_name"] = node_name
+    obj["length"] = m.group("length")
+    obj["coverage"] = m.group("cov")
+    if len(chunks) > 1:
+        obj["edges"] = _parse_edges(node_name, chunks[1])
+    else:
+        obj["edges"] = None
+    obj["sequence"] = ""
+
+    return obj
+ """ + # endregion + expanded_walks = [] + walk_starts = {} + for i, walk in enumerate(walks): + walk_starts[walk[0]] = i + + for walk in walks: + path = [] + for i, node in enumerate(walk): + if node in walk_starts and i > 0: + if node != walk[0]: + p = path + p.extend(walks[walk_starts[node]]) + expanded_walks.append(p) + path = [] + path.append(node) + return expanded_walks + + +def _generate_sequence(G, graph_path): + """ + G.nodes['1+'] + :param graph_path: + :return: + """ + dna_seq = "" + for node_name in graph_path: + dna_seq += G.nodes[node_name]["sequence"] + return dna_seq + + +def _translate_dna(seq, min_length=150): + def frame_translation(sequence): + frame_sequences = [] + # translate for each frame + for frame_start in range(3): + frame_seq = sequence[frame_start:] + rem = len(frame_seq) % 3 + if rem == 1: + frame_seq += "NN" + if rem == 2: + frame_seq += "N" + + mrna = Seq(frame_seq) + t_seq = mrna.translate() + frame_sequences.append(f"{t_seq}") + return frame_sequences + + def filter_protein(p): + redundancy_pct = 0.80 + if len(p) < min_length: + return False + d = defaultdict(int) + for aa in p: + d[aa] += 1 + if max(d.values()) / len(p) > redundancy_pct: + return False + return True + + translated_sequences = frame_translation(seq) + rev_compl = Seq(seq).reverse_complement() + translated_sequences.extend(frame_translation(f"{rev_compl}")) + + return_list = [] + for a_sequence in translated_sequences: + return_list.extend(list(filter(filter_protein, a_sequence.split("*")))) + return return_list + + +def _extract_continuous_traversals(graph): + # region + """ + Traversal Extraction + -------------------- + Given a list of depth-first-search edges, find and extract all + continuous traversals. + + :param dfs_edge_list: a list of depth-first-search edges + :return: list of traversals + """ + # endregion + logger.info("Begin extracting traversals") + ret_val = [] + dfs_edge_list = nx.edge_dfs(graph) + walk = [] + for edge in dfs_edge_list: + if len(walk) == 0: + walk.extend([x for x in edge]) + elif edge[0] == walk[-1]: + walk.append(edge[1]) + else: + ret_val.append(walk) + walk = [] + walk.extend([x for x in edge]) + if len(walk) > 0: + ret_val.append(walk) + _expand_walk(ret_val) + logger.info("Finished extracting traversals") + return ret_val + + +def _add_node(graph, node): + graph.add_node( + node["node_name"], + length=node["length"], + coverage=node["coverage"], + sequence=node["sequence"], + ) + if node["edges"] is not None: + graph.add_edges_from(node["edges"]) + + +def _generate_graph(fastg_fname): + # region + """ + Generates a digraph from the FASTG file provided. + The FASTG file must be in SPADES format. 
+
+
+def _proteins_for_walk(graph, walk, min_length=150):
+    obj = {
+        "walk": walk,
+        "dna_sequence": _generate_sequence(graph, walk),
+        "proteins": [],
+    }
+    obj["proteins"] = _translate_dna(obj["dna_sequence"], min_length=min_length)
+    return obj
+
+
+def _protein_digest(proteins, enz_cleavage="trypsin", min_length=10):
+    # (1, 'LTQQNKTIFRLLIAALIIL...EFQLMEAVE') <- p
+    prot_pep = []
+    for p in proteins:
+        o = {
+            "prot_id": p[0],
+            "peptides": parser.cleave(
+                p[1], parser.expasy_rules[enz_cleavage], min_length=min_length
+            ),
+        }
+        prot_pep.append(o)
+    return prot_pep
+
+
+def format_pep_head(item):
+    header = ">Pep_"
+    header += str(item[0]) + "|Protein_"
+    prots = item[1].split(",")
+    header += "_".join(prots)
+    return header
+
+
+def write_peptide_fasta(db, fasta_name="peptide.fasta"):
+    # (1, '1,1,600')
+    logger.info("Executing select query for peptide writing.")
+    pep_prot = db.exec_query(
+        "SELECT peptide_id, GROUP_CONCAT(protein_id) FROM peptide_to_protein GROUP BY peptide_id"
+    )
+    logger.info("Writing output")
+    with open(fasta_name, "w") as f:
+        for item in pep_prot:
+            s = db.exec_query("SELECT sequence FROM peptide WHERE id = " + str(item[0]))
+            f.write(format_pep_head(item))
+            f.write(os.linesep)
+            f.write(s[0][0])
+            f.write(os.linesep)
+
+
+def peptides_for_fastg(
+    fastg_filename,
+    cleavage="trypsin",
+    min_protein_length=166,
+    min_peptide_length=10,
+    db_name="tst.db",
+    peptide_file_name="peptide.fasta",
+    create_plots=False,
+):
+    # region
+    """
+    FASTG to Protein Library
+    ------------------------
+
+    This package generates a candidate protein library in two phases:
+
+    1) Parsing a FASTG file to create graph traversals of longer stretches of DNA
+        - FASTG is parsed into a directed graph. A depth-first search is made on
+          all connecting edges. The DFS traversal is then used to concatenate all
+          DNA sequences in the path.
+        - DNA sequences are translated in all six reading frames and split into
+          candidate proteins at stop codons. Each DFS traversal can produce a set
+          of candidate protein sequences.
+        - Protein sequences are filtered on length and amino acid redundancy.
+        - Protein sequences are cleaved into peptide sequences.
+        - DFS traversals, proteins and peptides are stored in a SQLite database.
+          The linking relationships among all three are maintained in the DB.
+        - A FASTA file of peptides is produced for the user. This FASTA file is
+          to be used in a search against MSMS data.
+    2) Using verified peptides as a filter to produce a final candidate protein library
+        - The user will invoke the code with
+            - the DB
+            - a list of peptide sequences or a peptide FASTA file
+        - The submitted peptides are expected to have been verified against MSMS
+          data, representing found and identified peptide sequences.
+        - The verified peptides are used to filter proteins from the database;
+          these proteins become the final library.
+        - The verified peptides are used to score the proteins for
+            - coverage
+            - percent of verified vs.
total peptide associations
+        - Final user output
+            - SQLite database
+            - Protein score text file, comma delimited
+            - Filtered protein FASTA file
+    """
+    # endregion
+
+    logger.info(f"FASTG file: {fastg_filename}")
+    logger.info(f"Protein len: {min_protein_length}")
+    logger.info(f"Peptide len: {min_peptide_length}")
+    logger.info(f"Cleavage: {cleavage}")
+    logger.info(f"Database Name: {db_name}")
+    logger.info("============================================")
+
+    logger.info("Begin _generate_graph")
+    g = _generate_graph(fastg_filename)
+    logger.info("End _generate_graph")
+
+    walks = _extract_continuous_traversals(g)
+    protein_walk_objs = []
+
+    logger.info("Begin walks")
+    no_protein_count = 0
+    for walk in walks:
+        obj = _proteins_for_walk(g, walk, min_length=min_protein_length)
+        if len(obj["proteins"]) > 0:
+            protein_walk_objs.append(obj)
+        else:
+            no_protein_count += 1
+    logger.info("End walks")
+    logger.info(f"There were {no_protein_count} walks with no protein sequences.")
+
+    logger.info("Clearing graph object from memory")
+    g.clear()
+
+    db = DBManager(db_name)
+    db.insert_protein_walk_objects(protein_walk_objs)
+
+    proteins = db.get_protein_sequences()
+    db.insert_peptides(
+        _protein_digest(proteins, enz_cleavage=cleavage, min_length=min_peptide_length)
+    )
+
+    # Write peptide fasta
+    write_peptide_fasta(db, fasta_name=peptide_file_name)
+
+    # diagnostics
+    if create_plots:
+        logger.info("Generating diagnostic plots")
+        generate_diagnostic_plots(db)
+    logger.info("Run is complete")
+
+
+def verified_proteins(
+    tabular_file, fdr_level=0.10, decoy_header="XXX_", db_name="tst.db"
+):
+    # region
+    """
+    Purpose
+    -------
+    User has searched peptides against MSMS data. Now, we will produce
+    a FASTA library of proteins based on the verified peptides from the search.
+
+    Input
+    -----
+    MSGF+ tabular output file.
+
+    db_name: DB name used in production of peptide_file
+
+    Output
+    ------
+    A protein fasta file is written to disk
+
+    A protein score text file is written to disk
+    """
+    # endregion
+    score_tabular(tabular_file, fdr_level, decoy_header, db_name)
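fastg2protlib.py turns each walk's DNA into peptides by six-frame translation followed by an in-silico tryptic digest. A self-contained sketch of that convention, mirroring _translate_dna and _protein_digest (the DNA string is an illustrative toy, and the minimum lengths are lowered so the toy input survives the filters):

    from Bio.Seq import Seq
    from pyteomics import parser

    dna = "ATGGCTAAGCGTACGATTCCGGGTCTGAAATAA"  # illustrative toy sequence

    # Six-frame translation: three forward frames plus three on the
    # reverse complement, N-padded to a multiple of three.
    frames = []
    for template in (dna, str(Seq(dna).reverse_complement())):
        for start in range(3):
            chunk = template[start:]
            chunk += "N" * (-len(chunk) % 3)
            frames.append(str(Seq(chunk).translate()))

    # Candidate proteins are the stretches between stop codons ...
    candidates = [p for frame in frames for p in frame.split("*") if len(p) >= 5]

    # ... and peptides come from an in-silico tryptic digest.
    for protein in candidates:
        peptides = parser.cleave(protein, parser.expasy_rules["trypsin"], min_length=3)
        print(protein, sorted(peptides))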
diff --git a/fastg2protlib/mzid_sax_parsing.py b/fastg2protlib/mzid_sax_parsing.py
new file mode 100644
index 0000000..4ecec6c
--- /dev/null
+++ b/fastg2protlib/mzid_sax_parsing.py
@@ -0,0 +1,63 @@
+import operator
+import xml.sax
+
+
+class msgfContentHandler(xml.sax.ContentHandler):
+    def __init__(self, score_config):
+        self.sii = dict()
+        self.in_sii = False
+        self.current_sii = None
+        self.peptides = dict()
+        self.in_peptide = False
+        self.current_peptide = None
+        self.in_pep_seq = False
+        self.score_threshold = score_config["score_threshold"]
+        self.score_name = score_config["score_name"]
+        # Resolve the configured comparison ("lt", "le", ...) to the
+        # corresponding function in the operator module.
+        self.score_function = getattr(operator, score_config["comparison"])
+
+    def characters(self, chars):
+        if self.in_pep_seq:
+            # SAX may deliver a text node in several chunks; accumulate
+            # rather than overwrite.
+            if self.peptides[self.current_peptide] is None:
+                self.peptides[self.current_peptide] = ""
+            self.peptides[self.current_peptide] += chars
+
+    def startElement(self, name, attrs):
+        if name == "Peptide":
+            self.peptides[attrs["id"]] = None
+            self.in_peptide = True
+            self.current_peptide = attrs["id"]
+
+        if name == "PeptideSequence" and self.in_peptide:
+            self.in_pep_seq = True
+
+        if name == "SpectrumIdentificationItem":
+            if attrs["rank"] == "1":
+                self.in_sii = True
+                self.current_sii = {"peptide_ref": attrs["peptide_ref"]}
+
+        if self.in_sii and name == "cvParam":
+            if attrs["name"] == self.score_name:
+                if self.score_function(float(attrs["value"]), self.score_threshold):
+                    self.current_sii[self.score_name] = attrs["value"]
+
+    def endElement(self, name):
+        if name == "SpectrumIdentificationItem" and self.in_sii:
+            self.in_sii = False
+            # Keep only identifications whose score passed the threshold
+            if self.score_name in self.current_sii:
+                self.sii[self.current_sii["peptide_ref"]] = self.current_sii
+            self.current_sii = None
+        if name == "PeptideSequence":
+            self.in_pep_seq = False
+        if name == "Peptide":
+            self.in_peptide = False
+
+
+def parse_mzid(mzid_file, score_config):
+    verified_peptides = set()
+    parser = xml.sax.make_parser()
+    ch = msgfContentHandler(score_config)
+    parser.setContentHandler(ch)
+    parser.parse(mzid_file)
+    for peptide_ref in ch.sii.keys():
+        verified_peptides.add(ch.peptides[peptide_ref])
+    return verified_peptides
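mzid_sax_parsing.py filters rank-1 spectrum identifications by a single cvParam score. A minimal usage sketch of parse_mzid; the score_config keys are the ones the handler reads, while the cvParam name, threshold and file path shown are illustrative MSGF+ values:

    from fastg2protlib.mzid_sax_parsing import parse_mzid

    # Keep rank-1 PSMs whose "MS-GF:SpecEValue" is below 1e-10.
    # "comparison" names a function in the operator module ("lt", "le",
    # "gt", ...), applied as comparison(score_value, score_threshold).
    score_config = {
        "score_name": "MS-GF:SpecEValue",  # cvParam name to test (illustrative)
        "comparison": "lt",                # operator.lt
        "score_threshold": 1e-10,          # illustrative cutoff
    }

    peptides = parse_mzid("search_results.mzid", score_config)  # illustrative path
    print(f"{len(peptides)} verified peptide sequences")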
diff --git a/fastg2protlib/protein_score.py b/fastg2protlib/protein_score.py
new file mode 100644
index 0000000..911566f
--- /dev/null
+++ b/fastg2protlib/protein_score.py
@@ -0,0 +1,248 @@
+import logging
+import os
+import operator
+from pyteomics import parser, mzid
+import re
+import sqlite3
+import sys
+
+from fastg2protlib.mzid_sax_parsing import parse_mzid
+
+logger = logging.getLogger(__name__)
+logger.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))
+handler = logging.StreamHandler()
+formatter = logging.Formatter("%(asctime)s %(name)-12s %(levelname)-8s %(message)s")
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+session_db_name = None
+
+
+class ProteinScore:
+    def __init__(self):
+        self.protein_id = None
+        self.pct_pep_coverage = None
+        self.pct_sequence_coverage = None
+        self.verified_peptides = None
+
+    def __format__(self, formatstr):
+        return f"ID: {self.protein_id} Peptide Coverage by Count: {self.pct_pep_coverage} Protein Sequence Covered: {self.pct_sequence_coverage}"
+
+
+def _query_db(sql):
+    global session_db_name
+    conn = sqlite3.connect(session_db_name)
+    cursor = conn.cursor()
+    rs = cursor.execute(sql)
+    return rs
+
+
+def _proteins_for_peptide(pep_id):
+    """
+    peptide_to_protein
+        (peptide_id INTEGER, protein_id INTEGER);
+    """
+    rs = _query_db(
+        "SELECT protein_id FROM peptide_to_protein WHERE peptide_id = " + str(pep_id)
+    )
+    proteins = []
+    for r in rs.fetchall():
+        proteins.append(r[0])
+    return (pep_id, proteins)
+
+
+def _pepseq_mapping(peptides):
+    """
+    Uses a set of peptide sequences to return
+    a list of tuples:
+        (peptide id, list of protein ids the peptide sequence is part of)
+    """
+    sql_in = ",".join(map('"{0}"'.format, peptides))
+
+    rs = _query_db("SELECT * FROM peptide WHERE peptide.sequence IN (" + sql_in + ")")
+    ret_val = []
+    for r in rs.fetchall():
+        ret_val.append(_proteins_for_peptide(r[0]))
+    return ret_val
+
+
+def _protein_sequence_coverage(p_dict):
+    """
+    Return a list of ProteinScore objects. Coverage is filled out.
+    """
+    ret_val = []
+    for k, v in p_dict.items():
+        in_clause = "(" + ",".join(map(lambda x: str(x), v)) + ")"
+        pep_rs = _query_db("select sequence from peptide where id in " + in_clause)
+        pep_sequences = list(map(lambda x: x[0], pep_rs.fetchall()))
+        prot_rs = _query_db("select sequence from protein where id = " + str(k))
+        prot_seq = prot_rs.fetchone()[0]
+        ps = ProteinScore()
+        ps.verified_peptides = v
+        ps.protein_id = k
+        ps.pct_sequence_coverage = round(parser.coverage(prot_seq, pep_sequences), 3)
+        ret_val.append(ps)
+    return ret_val
+
+
+def _protein_dict(pep_lst):
+    # region
+    """
+    Create a dictionary:
+        key = protein id
+        value = list of peptides associated with protein
+
+    Dictionary gives us the raw count of unique peptides for each protein.
+
+    Parameters
+    ----------
+    pep_lst
+
+    Returns
+    -------
+    dictionary
+    """
+    # endregion
+    protein_dict = dict()
+    for peptide_id, proteins in pep_lst:
+        for protein_id in proteins:
+            p_id = int(protein_id)
+            if p_id not in protein_dict:
+                protein_dict[p_id] = []
+            protein_dict[p_id].append(int(peptide_id))
+    return protein_dict
+
+
+def _peptide_pct_coverage(lst):
+    # sqlite> select protein_id, GROUP_CONCAT(peptide_id) from peptide_to_protein where protein_id = 323;
+    # 323|283,327,334,338,354,2305,2821,2822,2824,3070,3590,3591,3592,3593,3594,3693,3702,4023,6636
+    for ps in lst:
+        sql = f"SELECT protein_id, GROUP_CONCAT(peptide_id) FROM peptide_to_protein WHERE protein_id = {ps.protein_id}"
+        rs = _query_db(sql)
+        # GROUP_CONCAT yields one comma-delimited string; split it so we
+        # count associated peptides, not characters.
+        all_peptides = rs.fetchall()[0][1].split(",")
+        ps.pct_pep_coverage = round(len(ps.verified_peptides) / len(all_peptides), 3)
+
+
+def _read_mzid(mzid_file, score_config):
+    logger.info(f"Starting to read MZIdentML file {mzid_file}")
+
+    verified_peptides = parse_mzid(mzid_file, score_config)
+
+    logger.info(f"Finished with the MZIdentML file {mzid_file}")
+    return verified_peptides
+
+
+def _write_protein_scores(p_scores):
+    with open("protein_scores.csv", "w") as f:
+        f.write("protein_id,pct_sequence_coverage,pct_peptide_coverage")
+        f.write(os.linesep)
+        for ps in p_scores:
+            f.write(f"{ps.protein_id},{ps.pct_sequence_coverage},{ps.pct_pep_coverage}")
+            f.write(os.linesep)
+
+
+def _write_fasta(p_scores):
+    with open("protein.fasta", "w") as f:
+        for ps in p_scores:
+            # The protein table stores only (id, sequence); the FASTA
+            # header carries the protein id and its originating walk.
+            rs = _query_db(f"SELECT sequence FROM protein WHERE id = {ps.protein_id}")
+            sequence = rs.fetchone()[0]
+            rs = _query_db(
+                f"SELECT walk FROM walk WHERE id IN (SELECT walk_id FROM protein_to_walk WHERE protein_id = {ps.protein_id})"
+            )
+            f.write(f">Protein_{ps.protein_id}|{rs.fetchone()[0]}")
+            f.write(os.linesep)
+            f.write(f"{sequence}")
+            f.write(os.linesep)
+
+
+def score_tabular(tabular_file, fdr_level, decoy_header, db_name):
+    # region
+    """
+    Use tabular MSGF+ output to score the proteins via FDR.
+    """
+    # endregion
+    global session_db_name
+    session_db_name = db_name
+
+    fwd_id = 0
+    rev_id = 0
+
+    peptide_sequences = set()
+    regex = re.compile(r"[^A-Za-z]")
+
+    with open(tabular_file) as f:
+        header = f.readline().strip().split("\t")
+        for line in f:
+            values = line.strip().split("\t")
+            psm = dict(zip(header, values))
+            if psm["Protein"].startswith(decoy_header):
+                rev_id += 1.0
+            else:
+                fwd_id += 1.0
+                if (float(rev_id) / fwd_id) > fdr_level:
+                    break
+                else:
+                    # Add peptide sequence, remove modification value
+                    peptide_sequences.add(regex.sub("", psm["Peptide"]))
+    logger.info(
+        f"Generated {len(peptide_sequences)} unique peptide sequences at FDR <= {fdr_level}"
+    )
+    proteins_to_peptides = _pepseq_mapping(peptide_sequences)
+
+    protein_dict = _protein_dict(proteins_to_peptides)
+    protein_scores = _protein_sequence_coverage(protein_dict)
+    _peptide_pct_coverage(protein_scores)
+    _write_protein_scores(protein_scores)
+    _write_fasta(protein_scores)
+    logger.info("Completed scoring using MSGF+ tabular output")
+
+
+"""
+Example MSGF+ tabular PSM, parsed into a dict keyed by the header row:
+{
+    '#SpecFile': 'T4A.mzML', 'SpecID': 'index=262484', 'ScanNum': '-1',
+    'FragMethod': 'CID', 'Precursor': '1111.6', 'IsotopeError': '1',
+    'PrecursorError(ppm)': '5.181823', 'Charge': '3', 'Peptide': 'FDIPIIQLQPPFVVDLLDSNIAVFGAAM+15.995SGK',
+    'Protein': 'Pep_1590|Protein_133(pre=-,post=-)', 'DeNovoScore': '160', 'MSGFScore': '-61',
+    'SpecEValue': '0.025976893', 'EValue': '6830.0264', 'QValue': '0.4996623', 'PepQValue': '0.80702245'}
+"""
+
+
+def score_proteins(mzid_file, score_object, db_name="tst.db"):
+    # region
+    """
+    Purpose
+    -------
+    User has searched peptides against MSMS data. Now, we will produce
+    a FASTA library of proteins based on the verified peptides from the search.
+
+    Inputs
+    ------
+    MzIdentML file.
+
+    Return
+    ------
+    Protein FASTA file.
+    """
+    # endregion
+    global session_db_name
+    session_db_name = db_name
+
+    verified_peptides = _read_mzid(mzid_file, score_object)
+    proteins_to_peptides = _pepseq_mapping(verified_peptides)
+
+    protein_dict = _protein_dict(proteins_to_peptides)
+    protein_scores = _protein_sequence_coverage(protein_dict)
+    _peptide_pct_coverage(protein_scores)
+    _write_protein_scores(protein_scores)
+    _write_fasta(protein_scores)
+    logger.info("Completed scoring")
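score_tabular estimates FDR as the running decoy/target ratio over the ordered PSM stream and stops accepting peptides at the first crossing. A small worked example of that cutoff rule on labels alone (the PSM labels are illustrative):

    # Running decoy/target ratio over an ordered PSM stream, stopping when
    # the estimated FDR first exceeds the requested level; this is the same
    # rule score_tabular applies line by line.
    psms = ["T", "T", "T", "T", "D", "T", "T", "D", "T", "D"]  # T=target, D=decoy
    fdr_level = 0.10

    fwd, rev, accepted = 0, 0, []
    for i, label in enumerate(psms):
        if label == "D":
            rev += 1
        else:
            fwd += 1
            if rev / fwd > fdr_level:
                break
            accepted.append(i)

    print(accepted)  # [0, 1, 2, 3]: targets accepted at <= 10% estimated FDR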
+ + """ + # endregion + global session_db_name + session_db_name = db_name + + verified_peptides = _read_mzid(mzid_file, score_object) + proteins_to_peptides = _pepseq_mapping(verified_peptides) + + protein_dict = _protein_dict(proteins_to_peptides) + protein_scores = _protein_sequence_coverage(protein_dict) + _peptide_pct_coverage(protein_scores) + _write_protein_scores(protein_scores) + _write_fasta(protein_scores) + logger.info("Completed scoring") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..08a15fc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,27 @@ +appdirs==1.4.3 +attrs==19.3.0 +biopython==1.76 +black==19.10b0 +click==7.1.1 +cycler==0.10.0 +decorator==4.4.2 +kiwisolver==1.2.0 +lxml==4.5.0 +matplotlib==3.2.2 +more-itertools==8.4.0 +networkx==2.4 +numpy==1.18.2 +packaging==20.4 +pathspec==0.7.0 +pluggy==0.13.1 +py==1.9.0 +pyparsing==2.4.7 +pyteomics==4.2 +pytest==5.4.3 +pytest-datadir==1.3.1 +python-dateutil==2.8.1 +regex==2020.2.20 +six==1.15.0 +toml==0.10.0 +typed-ast==1.4.1 +wcwidth==0.2.5 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4de0952 --- /dev/null +++ b/setup.py @@ -0,0 +1,28 @@ +import setuptools + +_pkg_version = "1.0.0" +_author = "Thomas McGowan" +_author_email = "mcgo0092@umn.edu" +_license = "MIT" +_repo = "https://github.com/galaxyproteomics/fastg2protlib" + + +setuptools.setup( + name="fastg2protlib", + version=f"{_pkg_version}", + packages=["fastg2protlib"], + install_requires=["biopython", "lxml", "networkx", "numpy", "pyteomics", "matplotlib"], + description="FASTG sequences to a protein library", + author=f"{_author}", + author_email=f"{_author_email}", + url="f{_repo}", + download_url=f"{_repo}/archive/v_{_pkg_version}.tar.gz", + license=f"{_license}", + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Intended Audience :: Science/Research", + ], + python_requires=">=3.6", +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/two.fastg b/tests/data/two.fastg new file mode 100644 index 0000000..f29abe3 --- /dev/null +++ b/tests/data/two.fastg @@ -0,0 +1,40 @@ +>EDGE_1_length_84_cov_1.0:EDGE_3_length_84_cov_1.0; +CGTTATTCGCGCCCACTCTCCCATTTATCCGCGCAAGCGGATGCGATGCGATTGCCCGCTAAGATATTCTTACCATTCTCGACA +>EDGE_1_length_84_cov_1.0'; +TGTCGAGAATGGTAAGAATATCTTAGCGGGCAATCGCATCGCATCCGCTTGCGCGGATAAATGGGAGAGTGGGCGCGAATAACG +>EDGE_2_length_84_cov_1.0:EDGE_3_length_84_cov_1.0; +CTGGTCCTGTTGACTACAATGGGCCCAACTCAATCACAGCTCGAGCGCCTTGAATAACATACTCATCTCTATACATTCTCGACA +>EDGE_2_length_84_cov_1.0':EDGE_3_length_84_cov_1.0'; +TGTCGAGAATGTATAGAGATGAGTATGTTATTCAAGGCGCTCGAGCTGTGATTGAGTTGGGCCCATTGTAGTCAACAGGACCAG +>EDGE_3_length_84_cov_1.0:EDGE_2_length_84_cov_1.0,EDGE_4_length_84_cov_1.0; +CATTCTCGACATGCTGAGCTGAGACGGCGTCGATGCATAGCGGACTTTCGGTCAGTCGCAATTCCTCACGAGACTGGTCCTGTT +>EDGE_3_length_84_cov_1.0':EDGE_2_length_84_cov_1.0',EDGE_1_length_84_cov_1.0'; +AACAGGACCAGTCTCGTGAGGAATTGCGACTGACCGAAAGTCCGCTATGCATCGACGCCGTCTCAGCTCAGCATGTCGAGAATG +>EDGE_4_length_84_cov_1.0:EDGE_5_length_84_cov_1.0; +CTGGTCCTGTTACAGAGCTGGCGTACGCGTTGAACACTTCACAGATGATAGGGATTCGGGTAAAGAGCGTGTCATTGGGGGCTT +>EDGE_4_length_84_cov_1.0':EDGE_3_length_84_cov_1.0'; +AAGCCCCCAATGACACGCTCTTTACCCGAATCCCTATCATCTGTGAAGTGTTCAACGCGTACGCCAGCTCTGTAACAGGACCAG +>EDGE_5_length_84_cov_1.0; +ATTGGGGGCTTCATACATAGAGCAAGGGCGTCGAACGGTCGTGAAAGTCTTAGTACCGCACGTACCAACTTACTGAGGATATTG 
+>EDGE_5_length_84_cov_1.0':EDGE_4_length_84_cov_1.0',EDGE_6_length_84_cov_1.0'; +CAATATCCTCAGTAAGTTGGTACGTGCGGTACTAAGACTTTCACGACCGTTCGACGCCCTTGCTCTATGTATGAAGCCCCCAAT +>EDGE_6_length_84_cov_1.0:EDGE_5_length_84_cov_1.0; +AAGAGGCCGCCACCGTTTTAGGGGGGGAAGGTTGAAGATCTCCTCTTCTCATGACTGAACTCGCGAGGGCCGTATTGGGGGCTT +>EDGE_6_length_84_cov_1.0':EDGE_8_length_84_cov_1.0'; +AAGCCCCCAATACGGCCCTCGCGAGTTCAGTCATGAGAAGAGGAGATCTTCAACCTTCCCCCCCTAAAACGGTGGCGGCCTCTT +>EDGE_7_length_84_cov_1.0:EDGE_8_length_84_cov_1.0; +AAGAGGCCGCCAAAGAACAAAGGCTTACTGTGCGCAGAGGAACGCCCATTTAGCGGCTGGCGTTTTGAATCCTTTTAATATTGT +>EDGE_7_length_84_cov_1.0':EDGE_8_length_84_cov_1.0'; +ACAATATTAAAAGGATTCAAAACGCCAGCCGCTAAATGGGCGTTCCTCTGCGCACAGTAAGCCTTTGTTCTTTGGCGGCCTCTT +>EDGE_8_length_84_cov_1.0:EDGE_7_length_84_cov_1.0,EDGE_6_length_84_cov_1.0; +TTTAATATTGTTTAATCCAATTCCCTCATTTAGGACCCTACCAAGTCAACATTGGTATATGAATGCGACCTCGAAGAGGCCGCC +>EDGE_8_length_84_cov_1.0':EDGE_7_length_84_cov_1.0',EDGE_9_length_84_cov_1.0'; +GGCGGCCTCTTCGAGGTCGCATTCATATACCAATGTTGACTTGGTAGGGTCCTAAATGAGGGAATTGGATTAAACAATATTAAA +>EDGE_9_length_84_cov_1.0:EDGE_8_length_84_cov_1.0; +TAAAAATGACAGTGGTTGGTGCTCTAAACTTCATTTGGTTAACTCGTGTATCAGCGCGATAGGCTGTTAGAGGTTTAATATTGT +>EDGE_9_length_84_cov_1.0'; +ACAATATTAAACCTCTAACAGCCTATCGCGCTGATACACGAGTTAACCAAATGAAGTTTAGAGCACCAACCACTGTCATTTTTA +>EDGE_10_length_84_cov_1.0; +ATGGCAAGGTACTTCCGGTCTTAATGAATGGCCGGGAAAGGTACGCACGCGGTATGGGGGGGTGAAGGGGCGAATAGACAGGCT +>EDGE_10_length_84_cov_1.0':EDGE_10_length_84_cov_1.0; +AGCCTGTCTATTCGCCCCTTCACCCCCCCATACCGCGTGCGTACCTTTCCCGGCCATTCATTAAGACCGGAAGTACCTTGCCAT diff --git a/tests/test_basic.py b/tests/test_basic.py new file mode 100644 index 0000000..6e9c068 --- /dev/null +++ b/tests/test_basic.py @@ -0,0 +1,18 @@ +from fastg2protlib import fastg2protlib +import os + + +def test_peptides_for_fastg(shared_datadir, tmpdir): + contents = (shared_datadir / 'two.fastg').read_text() + sub = tmpdir.mkdir("sub") + p = sub.join("fastg_input") + p.write(contents) + + fastg2protlib.peptides_for_fastg( + os.path.join(sub, "fastg_input"), + min_protein_length=40, + db_name=os.path.join(sub, "test.db"), + peptide_file_name=os.path.join(sub, "peptide.fasta")) + with open(os.path.join(sub, "peptide.fasta")) as f: + peps = f.readlines() + assert len(peps) == 68 \ No newline at end of file