From 6470d4a77023da7ab6fd12d0de1045477d977fa3 Mon Sep 17 00:00:00 2001
From: tmcgowan
Date: Tue, 21 Jul 2020 14:30:14 -0500
Subject: [PATCH] Initial commit

---
 .gitignore                        |  20 ++
 LICENSE.txt                       |  17 ++
 README.md                         |  34 +++
 fastg2protlib/__init__.py         |   0
 fastg2protlib/db.py               | 211 +++++++++++++++
 fastg2protlib/diagnostics.py      |  70 +++++
 fastg2protlib/fastg2protlib.py    | 423 ++++++++++++++++++++++++++++++
 fastg2protlib/mzid_sax_parsing.py |  63 +++++
 fastg2protlib/protein_score.py    | 248 ++++++++++++++++++
 requirements.txt                  |  27 ++
 setup.py                          |  28 ++
 tests/__init__.py                 |   0
 tests/data/two.fastg              |  40 +++
 tests/test_basic.py               |  18 ++
 14 files changed, 1199 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 LICENSE.txt
 create mode 100644 README.md
 create mode 100644 fastg2protlib/__init__.py
 create mode 100644 fastg2protlib/db.py
 create mode 100644 fastg2protlib/diagnostics.py
 create mode 100644 fastg2protlib/fastg2protlib.py
 create mode 100644 fastg2protlib/mzid_sax_parsing.py
 create mode 100644 fastg2protlib/protein_score.py
 create mode 100644 requirements.txt
 create mode 100644 setup.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/data/two.fastg
 create mode 100644 tests/test_basic.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3a7d4f2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,20 @@
+dist/
+build/
+fastg2protlib.egg-info/
+
+venv/
+.vscode/
+.pytest_cache/
+fastg2protlib/__pycache__/
+tests/__pycache__/
+.idea/
+*.db
+*.csv
+*.fasta
+*.mzid
+notes.txt
+run_*
+*.tabular
+
+.DS_Store
+application.py
\ No newline at end of file
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..3754756
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,17 @@
+MIT License
+Copyright (c) 2018 YOUR NAME
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b7a5558
--- /dev/null
+++ b/README.md
@@ -0,0 +1,34 @@
+# FASTG to Protein Library
+
+
+This package generates a candidate protein library in two phases:
+
+1) Parsing a FASTG file to create graph traversals of longer stretches of DNA
+
+   - FASTG is parsed into a directed graph. A depth-first search is made on all connecting
+     edges. The DFS traversal is then used to concatenate all DNA sequences in the path.
+   - DNA sequences are translated in all six reading frames and split into candidate
+     proteins at stop codons. Each DFS traversal can produce a set of candidate protein sequences.
+   - Protein sequences are filtered on length and amino acid redundancy.
+   - Protein sequences are cleaved into peptide sequences.
+   - DFS traversals, proteins and peptides are stored in a SQLite database. The linking
+     relationships among all three are maintained in the DB.
+   - A FASTA file of peptides is produced for the user. This FASTA file is to be used
+     in a search against MSMS data.
+
+2) Using verified peptides as a filter to produce a final candidate protein library
+
+   - The user will invoke the code with
+     - the DB
+     - a list of peptide sequences or a peptide FASTA file
+   - The submitted peptides are expected to have been verified against MSMS data,
+     representing found and identified peptide sequences.
+   - The verified peptides are used to filter proteins from the database; these
+     proteins become the final library.
+   - The verified peptides are used to score the proteins for
+     - coverage
+     - percent of verified vs. total peptide associations
+   - Final user output
+     - SQLite database
+     - Protein score text file, comma delimited
+     - Filtered protein FASTA file
\ No newline at end of file
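The two phases in the README map onto two entry points defined later in this patch, peptides_for_fastg and verified_proteins. A minimal sketch of a full round trip (file names here are illustrative, and the MSMS search itself happens outside this package, between the two calls):

    from fastg2protlib.fastg2protlib import peptides_for_fastg, verified_proteins

    # Phase 1: FASTG -> SQLite DB plus a peptide FASTA for the search engine
    peptides_for_fastg(
        "assembly_graph.fastg",      # illustrative input path
        cleavage="trypsin",
        min_protein_length=166,
        min_peptide_length=10,
        db_name="candidates.db",     # illustrative DB path
        peptide_file_name="peptide.fasta",
    )

    # ... search peptide.fasta against MSMS data with MSGF+, export tabular PSMs ...

    # Phase 2: verified PSMs -> scored, filtered protein library
    verified_proteins(
        "msgf_psms.tabular",         # illustrative MSGF+ tabular output
        fdr_level=0.10,
        decoy_header="XXX_",
        db_name="candidates.db",
    )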
diff --git a/fastg2protlib/__init__.py b/fastg2protlib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/fastg2protlib/db.py b/fastg2protlib/db.py
new file mode 100644
index 0000000..f1177b2
--- /dev/null
+++ b/fastg2protlib/db.py
@@ -0,0 +1,211 @@
+import logging
+import os
+import sqlite3
+
+
+class DBManager:
+    def __init__(self, name="app.db"):
+        self.database_name = name
+        self.conn = sqlite3.connect(self.database_name)
+        self.cursor = self.conn.cursor()
+        self.logger = logging.getLogger(__name__)
+        self.logger.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))
+        handler = logging.StreamHandler()
+        formatter = logging.Formatter(
+            "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
+        )
+        handler.setFormatter(formatter)
+        self.logger.addHandler(handler)
+        # Build the tables
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS walk
+            (id INTEGER PRIMARY KEY, walk TEXT, dna_sequence TEXT)
+            """
+        )
+
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS protein
+            (id INTEGER PRIMARY KEY, sequence TEXT)"""
+        )
+
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS peptide
+            (id INTEGER PRIMARY KEY, sequence TEXT)"""
+        )
+
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS protein_to_walk
+            (walk_id INTEGER, protein_id INTEGER)"""
+        )
+
+        self.cursor.execute(
+            """CREATE TABLE IF NOT EXISTS peptide_to_protein
+            (peptide_id INTEGER, protein_id INTEGER)"""
+        )
+
+        self.conn.commit()
+
+    def exec_query(self, query):
+        return self.cursor.execute(query).fetchall()
+
+    def d_quote(self, s):
+        return '"' + s + '"'
+
+    def link_proteins_peptides(self, prot_peps):
+        """
+        Populate many-to-many table linking peptides to proteins
+
+        prot_pep object:
+            {
+                "prot_id": int,
+                "peptides": set()
+            }
+        """
+        self.logger.info("Begin link_proteins_peptides")
+        sql = sqlite3.connect(self.database_name)
+        sql.isolation_level = None
+        c = sql.cursor()
+        c.execute("begin")
+        try:
+            for obj in prot_peps:
+                protein_id = obj["prot_id"]
+                pep_seqs = []
+                for pep_seq in obj["peptides"]:
+                    pep_seqs.append(pep_seq)
+                in_clause = "(" + ",".join(list(map(self.d_quote, pep_seqs))) + ")"
+                sql_q = f"SELECT id FROM peptide WHERE sequence IN {in_clause}"
+                pep_ids = c.execute(sql_q).fetchall()
+                ins_data = []
+                for pep_id in pep_ids:
+                    ins_data.append((pep_id[0], protein_id))
+                sql_ins = "INSERT INTO peptide_to_protein(peptide_id, protein_id) VALUES (?, ?)"
+                c.executemany(sql_ins, ins_data)
+
+            c.execute("commit")
+        except sqlite3.Error:
+            self.logger.error(
+                "Transaction failed, rolling back in link_proteins_peptides"
+            )
+            c.execute("rollback")
+        self.logger.info("End link_proteins_peptides")
c.execute("commit") + except sql.Error: + self.logger.error( + "Transaction failed, rolling back in link_proteins_peptides" + ) + c.execute("rollback") + self.logger.info("End link_proteins_peptides") + + def insert_peptides(self, prot_peps): + """{ + "prot_id": int, + "peptides": set() + } + """ + self.logger.info(f"Begin Insert peptides") + sql = sqlite3.connect(self.database_name) + sql.isolation_level = None + c = sql.cursor() + c.execute("begin") + + def data_iter(data): + """Filter out duplicate sequences + """ + seen = set() + for item in data: + if item in seen: + pass + else: + seen.add(item) + yield item + + try: + pep_seqs = [] + for pp in prot_peps: + for p in pp["peptides"]: + pep_seqs.append((p,)) + c.executemany( + "INSERT INTO peptide(sequence) VALUES (?)", data_iter(pep_seqs) + ) + c.execute("commit") + except sql.Error: + self.logger.error("Transaction failed, rolling back in insert_peptides") + c.execute("rollback") + # Add index + self.logger.info("Creating peptide_sequence index") + c.execute("CREATE INDEX peptide_sequence ON peptide(sequence)") + self.link_proteins_peptides(prot_peps) + self.logger.info(f"End Insert peptides") + + def build_links(self, objects): + """ + Populate many-to-many for proteins to walk traversals + """ + self.logger.info(f"Begin build_links") + sql = sqlite3.connect(self.database_name) + sql.isolation_level = None + c = sql.cursor() + c.execute("begin") + try: + for object in objects: + walk = ",".join(object["walk"]) + w_s = f'select id from walk where walk = "{walk}"' + w_id = c.execute(w_s).fetchone() + prot_seqs = [] + for p in object["proteins"]: + prot_seqs.append(p) + in_clause = "(" + ",".join(list(map(self.d_quote, prot_seqs))) + ")" + sql_q = f"SELECT id FROM protein WHERE sequence IN {in_clause}" + prot_ids = c.execute(sql_q).fetchall() + ins_data = [] + for prot_id in prot_ids: + ins_data.append((w_id[0], prot_id[0])) + sql_ins = ( + "INSERT INTO protein_to_walk(walk_id, protein_id) VALUES (?,?)" + ) + c.executemany(sql_ins, ins_data) + c.execute("commit") + except sql.Error: + self.logger.error("Transaction failed, rolling back in build_links") + c.execute("rollback") + self.logger.info(f"End build_links") + + def insert_protein_walk_objects(self, objects): + self.logger.info(f"Begin insert_protein_walk_objects") + sql = sqlite3.connect(self.database_name) + sql.isolation_level = None + c = sql.cursor() + c.execute("begin") + + def protein_sequence_iter(objects): + for object in objects: + for p in object["proteins"]: + yield (p,) + + def walk_sequence_iter(objects): + for object in objects: + walk = ",".join(object["walk"]) + seq = object["dna_sequence"] + yield (walk, seq) + + try: + c.executemany( + "INSERT INTO protein(sequence) VALUES (?)", + protein_sequence_iter(objects), + ) + c.executemany( + "INSERT INTO walk (walk, dna_sequence) VALUES (?,?)", + walk_sequence_iter(objects), + ) + c.execute("commit") + except sql.Error: + self.logger.error( + "Transaction failed, rolling back in insert_protein_walk_objects" + ) + c.execute("rollback") + # Create indexes + self.logger.info("Creating indexes: walks_walk, walks_w_d and protein_sequence") + c.execute("CREATE INDEX walks_walk ON walk(walk)") + c.execute("CREATE INDEX protein_sequence ON protein(sequence)") + c.execute("CREATE INDEX walks_w_d ON walk(walk, dna_sequence)") + + self.build_links(objects) + self.logger.info(f"End insert_protein_walk_objects") + + def get_protein_sequences(self): + self.logger.info(f"Begin get_protein_sequences") + c = self.conn.execute("select 
diff --git a/fastg2protlib/diagnostics.py b/fastg2protlib/diagnostics.py
new file mode 100644
index 0000000..d42605d
--- /dev/null
+++ b/fastg2protlib/diagnostics.py
@@ -0,0 +1,70 @@
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from Bio.SeqUtils import GC
+
+
+def _sequence_histogram(seq_lengths, title, file_name):
+    plt.hist(seq_lengths, 20, facecolor="blue", alpha=0.5)
+    plt.xlabel("Length")
+    plt.ylabel("Count")
+    plt.title(title)
+    plt.savefig(file_name)
+    plt.clf()
+
+
+def _gc_plot(sequences, title, file_name):
+    gc_values = sorted(GC(seq) for seq in sequences)
+    plt.plot(gc_values)
+    plt.title(title)
+    plt.xlabel("ORFs")
+    plt.ylabel("GC%")
+    plt.savefig(file_name)
+    plt.clf()
+
+
+def _aa_barchart(prot_sequences):
+    aa_count = defaultdict(int)
+    for sequence in prot_sequences:
+        for aa in sequence:
+            aa_count[aa] += 1
+
+    objects = [*aa_count]
+    objects.sort()
+    y_pos = np.arange(len(objects))
+    counts = []
+    for aa in objects:
+        counts.append(aa_count[aa])
+
+    plt.bar(y_pos, counts, align="center", facecolor="blue", alpha=0.5)
+    plt.xticks(y_pos, objects)
+    plt.ylabel("Count")
+    plt.title("Amino Acid Count for Translated Proteins")
+    plt.savefig("aa_count_chart.png")
+    plt.clf()
+
+
+def generate_diagnostic_plots(db_manager):
+    dna_results = db_manager.exec_query("select dna_sequence from walk")
+
+    dna_lengths = map(lambda x: len(x[0]), dna_results)
+    _sequence_histogram(
+        list(dna_lengths), "DNA Sequence Lengths from FASTG", "fastg_seq_lengths.png"
+    )
+
+    protein_results = db_manager.exec_query("select sequence from protein")
+    protein_lengths = map(lambda x: len(x[0]), protein_results)
+    _sequence_histogram(
+        list(protein_lengths), "Protein Sequence Lengths", "protein_seq_lengths.png"
+    )
+
+    _gc_plot(
+        list(map(lambda x: x[0], dna_results)),
+        "GC Pct. - Translated FASTG",
+        "gc_pct.png",
+    )
+
+    # bar chart of all translated protein amino acids
+    _aa_barchart(list(map(lambda x: x[0], protein_results)))
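diagnostics.py drives all four plots from plain SQL against the run database, so they can be regenerated for any existing database without re-running the pipeline. A minimal sketch (the database path is illustrative; the PNGs are written to the current working directory):

    from fastg2protlib.db import DBManager
    from fastg2protlib.diagnostics import generate_diagnostic_plots

    # Re-open a database produced by peptides_for_fastg() and write
    # fastg_seq_lengths.png, protein_seq_lengths.png, gc_pct.png and
    # aa_count_chart.png.
    db = DBManager("candidates.db")  # illustrative path
    generate_diagnostic_plots(db)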
diff --git a/fastg2protlib/fastg2protlib.py b/fastg2protlib/fastg2protlib.py
new file mode 100644
index 0000000..3434172
--- /dev/null
+++ b/fastg2protlib/fastg2protlib.py
@@ -0,0 +1,423 @@
+from Bio.Seq import Seq
+from collections import defaultdict
+import datetime
+import logging
+import networkx as nx
+import os
+from pyteomics import parser
+import re
+import sys
+
+from fastg2protlib.diagnostics import generate_diagnostic_plots
+from fastg2protlib.db import DBManager
+from fastg2protlib.protein_score import score_proteins, score_tabular
+
+logger = logging.getLogger(__name__)
+logger.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))
+handler = logging.StreamHandler()
+formatter = logging.Formatter("%(asctime)s %(name)-12s %(levelname)-8s %(message)s")
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+
+def _parse_edges(edge_name, edge_str):
+    """
+    EDGE_71_length_1904_cov_38.3193,EDGE_72_length_57_cov_29.75
+
+    becomes [(node_name, '71+'), ('71+', '72+')]
+
+    :param edge_name:
+    :param edge_str:
+    :return: list of tuples
+    """
+    r_val = []
+
+    all_edges = edge_str.split(",")
+    leading_edge = edge_name
+    for e in all_edges:
+        rc = False
+        if e.endswith("'"):
+            rc = True
+        regex = re.compile(r"(?:EDGE|NODE)_(?P<node_name>[a-zA-Z\d]+?)_length_")
+        m = regex.search(e)
+        node_name = m.group("node_name")
+        if rc:
+            node_name += "-"
+        else:
+            node_name += "+"
+        r_val.append((leading_edge, node_name))
+        leading_edge = node_name
+    return r_val
+
+
+def _parse_fastg_header(header):
+    # region
+    """
+    Parse FASTG header and return
+        * node_name: '1+'
+        * length: 449
+        * coverage: 82.5991
+        * edges: [('1+', '71+'), ('71+', '72+')]
+        * sequence: None
+        * warnings: [list of warning messages, if any]
+
+    >EDGE_1_length_449_cov_82.5991:EDGE_71_length_1904_cov_38.3193,EDGE_72_length_57_cov_29.75;
+
+    :param header:
+    :return:
+    """
+    # endregion
+    keys = ["node_name", "length", "coverage", "edges", "sequence", "warnings"]
+    obj = dict.fromkeys(keys)
+    # Warnings must start as a list so that later appends are safe
+    obj["warnings"] = []
+
+    header = header.lstrip(">").rstrip(";")
+    num_colons = sum([1 for x in header if x == ":"])
+    if num_colons > 1:
+        obj["warnings"].append("Too many colons in header, not in FASTG file format")
+
+    rev_compl = False
+    chunks = header.split(":")
+
+    if chunks[0].endswith("'"):
+        rev_compl = True
+    regex = re.compile(
+        r"(?:EDGE|NODE)_(?P<node>[a-zA-Z\d]+?)_length_(?P<length>\d+?)_cov_(?P<cov>[\d|\.]+)"
+    )
+    m = regex.search(chunks[0])
+    if m is None:
+        obj["warnings"].append("Regex parsing failed")
+        return obj
+    node_name = m.group("node")
+    if rev_compl:
+        node_name += "-"
+    else:
+        node_name += "+"
+    obj["node_name"] = node_name
+    obj["length"] = m.group("length")
+    obj["coverage"] = m.group("cov")
+    if len(chunks) > 1:
+        obj["edges"] = _parse_edges(node_name, chunks[1])
+    else:
+        obj["edges"] = None
+    obj["sequence"] = ""
+
+    return obj
+ """ + # endregion + expanded_walks = [] + walk_starts = {} + for i, walk in enumerate(walks): + walk_starts[walk[0]] = i + + for walk in walks: + path = [] + for i, node in enumerate(walk): + if node in walk_starts and i > 0: + if node != walk[0]: + p = path + p.extend(walks[walk_starts[node]]) + expanded_walks.append(p) + path = [] + path.append(node) + return expanded_walks + + +def _generate_sequence(G, graph_path): + """ + G.nodes['1+'] + :param graph_path: + :return: + """ + dna_seq = "" + for node_name in graph_path: + dna_seq += G.nodes[node_name]["sequence"] + return dna_seq + + +def _translate_dna(seq, min_length=150): + def frame_translation(sequence): + frame_sequences = [] + # translate for each frame + for frame_start in range(3): + frame_seq = sequence[frame_start:] + rem = len(frame_seq) % 3 + if rem == 1: + frame_seq += "NN" + if rem == 2: + frame_seq += "N" + + mrna = Seq(frame_seq) + t_seq = mrna.translate() + frame_sequences.append(f"{t_seq}") + return frame_sequences + + def filter_protein(p): + redundancy_pct = 0.80 + if len(p) < min_length: + return False + d = defaultdict(int) + for aa in p: + d[aa] += 1 + if max(d.values()) / len(p) > redundancy_pct: + return False + return True + + translated_sequences = frame_translation(seq) + rev_compl = Seq(seq).reverse_complement() + translated_sequences.extend(frame_translation(f"{rev_compl}")) + + return_list = [] + for a_sequence in translated_sequences: + return_list.extend(list(filter(filter_protein, a_sequence.split("*")))) + return return_list + + +def _extract_continuous_traversals(graph): + # region + """ + Traversal Extraction + -------------------- + Given a list of depth-first-search edges, find and extract all + continuous traversals. + + :param dfs_edge_list: a list of depth-first-search edges + :return: list of traversals + """ + # endregion + logger.info("Begin extracting traversals") + ret_val = [] + dfs_edge_list = nx.edge_dfs(graph) + walk = [] + for edge in dfs_edge_list: + if len(walk) == 0: + walk.extend([x for x in edge]) + elif edge[0] == walk[-1]: + walk.append(edge[1]) + else: + ret_val.append(walk) + walk = [] + walk.extend([x for x in edge]) + if len(walk) > 0: + ret_val.append(walk) + _expand_walk(ret_val) + logger.info("Finished extracting traversals") + return ret_val + + +def _add_node(graph, node): + graph.add_node( + node["node_name"], + length=node["length"], + coverage=node["coverage"], + sequence=node["sequence"], + ) + if node["edges"] is not None: + graph.add_edges_from(node["edges"]) + + +def _generate_graph(fastg_fname): + # region + """ + Generates a digraph from the FASTG file provided. + The FASTG file must be in SPADES format. 
+
+
+def _proteins_for_walk(graph, walk, min_length=150):
+    obj = {
+        "walk": walk,
+        "dna_sequence": _generate_sequence(graph, walk),
+        "proteins": [],
+    }
+    obj["proteins"] = _translate_dna(obj["dna_sequence"], min_length=min_length)
+    return obj
+
+
+def _protein_digest(proteins, enz_cleavage="trypsin", min_length=10):
+    # (1, 'LTQQNKTIFRLLIAALIIL...EFQLMEAVE') <- p
+    prot_pep = []
+    for p in proteins:
+        o = {
+            "prot_id": p[0],
+            "peptides": parser.cleave(
+                p[1], parser.expasy_rules[enz_cleavage], min_length=min_length
+            ),
+        }
+        prot_pep.append(o)
+    return prot_pep
+
+
+def format_pep_head(item):
+    header = ">Pep_"
+    header += str(item[0]) + "|Protein_"
+    prots = item[1].split(",")
+    header += "_".join(prots)
+    return header
+
+
+def write_peptide_fasta(db, fasta_name="peptide.fasta"):
+    # (1, '1,1,600')
+    logger.info("Executing select query for peptide writing.")
+    pep_prot = db.exec_query(
+        "SELECT peptide_id, GROUP_CONCAT(protein_id) FROM peptide_to_protein GROUP BY peptide_id"
+    )
+    logger.info("Writing output")
+    with open(fasta_name, "w") as f:
+        for item in pep_prot:
+            s = db.exec_query("SELECT sequence FROM peptide WHERE id = " + str(item[0]))
+            f.write(format_pep_head(item))
+            f.write(os.linesep)
+            f.write(s[0][0])
+            f.write(os.linesep)
+
+
+def peptides_for_fastg(
+    fastg_filename,
+    cleavage="trypsin",
+    min_protein_length=166,
+    min_peptide_length=10,
+    db_name="tst.db",
+    peptide_file_name="peptide.fasta",
+    create_plots=False,
+):
+    # region
+    """
+    FASTG to Protein Library
+    ------------------------
+
+    This package generates a candidate protein library in two phases:
+
+    1) Parsing a FASTG file to create graph traversals of longer stretches of DNA
+        - FASTG is parsed into a directed graph. A depth-first search is made on
+          all connecting edges. The DFS traversal is then used to concatenate all
+          DNA sequences in the path.
+        - DNA sequences are translated in all six reading frames and split into
+          candidate proteins at stop codons. Each DFS traversal can produce a set
+          of candidate protein sequences.
+        - Protein sequences are filtered on length and amino acid redundancy.
+        - Protein sequences are cleaved into peptide sequences.
+        - DFS traversals, proteins and peptides are stored in a SQLite database.
+          The linking relationships among all three are maintained in the DB.
+        - A FASTA file of peptides is produced for the user. This FASTA file is
+          to be used in a search against MSMS data.
+    2) Using verified peptides as a filter to produce a final candidate protein library
+        - The user will invoke the code with
+            - the DB
+            - a list of peptide sequences or a peptide FASTA file
+        - The submitted peptides are expected to have been verified against MSMS
+          data, representing found and identified peptide sequences.
+        - The verified peptides are used to filter proteins from the database;
+          these proteins become the final library.
+        - The verified peptides are used to score the proteins for
+            - coverage
+            - percent of verified vs.
total peptide associations
+        - Final user output
+            - SQLite database
+            - Protein score text file, comma delimited
+            - Filtered protein FASTA file
+    """
+    # endregion
+
+    logger.info(f"FASTG file: {fastg_filename}")
+    logger.info(f"Protein len: {min_protein_length}")
+    logger.info(f"Peptide len: {min_peptide_length}")
+    logger.info(f"Cleavage: {cleavage}")
+    logger.info(f"Database Name: {db_name}")
+    logger.info("============================================")
+
+    logger.info("Begin _generate_graph")
+    g = _generate_graph(fastg_filename)
+    logger.info("End _generate_graph")
+
+    walks = _extract_continuous_traversals(g)
+    protein_walk_objs = []
+
+    logger.info("Begin walks")
+    no_protein_count = 0
+    for walk in walks:
+        obj = _proteins_for_walk(g, walk, min_length=min_protein_length)
+        if len(obj["proteins"]) > 0:
+            protein_walk_objs.append(obj)
+        else:
+            no_protein_count += 1
+    logger.info("End walks")
+    logger.info(f"There were {no_protein_count} walks with no protein sequences.")
+
+    logger.info("Clearing graph object from memory")
+    g.clear()
+
+    db = DBManager(db_name)
+    db.insert_protein_walk_objects(protein_walk_objs)
+
+    proteins = db.get_protein_sequences()
+    db.insert_peptides(
+        _protein_digest(proteins, enz_cleavage=cleavage, min_length=min_peptide_length)
+    )
+
+    # Write peptide fasta
+    write_peptide_fasta(db, fasta_name=peptide_file_name)
+
+    # diagnostics
+    if create_plots:
+        logger.info("Generating diagnostic plots")
+        generate_diagnostic_plots(db)
+    logger.info("Run is complete")
+
+
+def verified_proteins(
+    tabular_file, fdr_level=0.10, decoy_header="XXX_", db_name="tst.db"
+):
+    # region
+    """
+    Purpose
+    -------
+    User has searched peptides against MSMS data. Now, we will produce
+    a FASTA library of proteins based on the verified peptides from the search.
+
+    Input
+    -----
+    MSGF+ tabular output file.
+
+    db_name: DB name used in production of peptide_file
+
+    Output
+    ------
+    A protein fasta file is written to disk
+
+    A protein score text file is written to disk
+    """
+    # endregion
+    score_tabular(tabular_file, fdr_level, decoy_header, db_name)
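fastg2protlib.py turns each walk's DNA into peptides by six-frame translation followed by an in-silico tryptic digest. A self-contained sketch of that convention, mirroring _translate_dna and _protein_digest (the DNA string is an illustrative toy, and the minimum lengths are lowered so the toy input survives the filters):

    from Bio.Seq import Seq
    from pyteomics import parser

    dna = "ATGGCTAAGCGTACGATTCCGGGTCTGAAATAA"  # illustrative toy sequence

    # Six-frame translation: three forward frames plus three on the
    # reverse complement, N-padded to a multiple of three.
    frames = []
    for template in (dna, str(Seq(dna).reverse_complement())):
        for start in range(3):
            chunk = template[start:]
            chunk += "N" * (-len(chunk) % 3)
            frames.append(str(Seq(chunk).translate()))

    # Candidate proteins are the stretches between stop codons ...
    candidates = [p for frame in frames for p in frame.split("*") if len(p) >= 5]

    # ... and peptides come from an in-silico tryptic digest.
    for protein in candidates:
        peptides = parser.cleave(protein, parser.expasy_rules["trypsin"], min_length=3)
        print(protein, sorted(peptides))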
diff --git a/fastg2protlib/mzid_sax_parsing.py b/fastg2protlib/mzid_sax_parsing.py
new file mode 100644
index 0000000..4ecec6c
--- /dev/null
+++ b/fastg2protlib/mzid_sax_parsing.py
@@ -0,0 +1,63 @@
+import operator
+import xml.sax
+
+
+class msgfContentHandler(xml.sax.ContentHandler):
+    def __init__(self, score_config):
+        self.sii = dict()
+        self.in_sii = False
+        self.current_sii = None
+        self.peptides = dict()
+        self.in_peptide = False
+        self.current_peptide = None
+        self.in_pep_seq = False
+        self.score_threshold = score_config["score_threshold"]
+        self.score_name = score_config["score_name"]
+        # Resolve the configured comparison ("lt", "le", ...) to the
+        # corresponding function in the operator module.
+        self.score_function = getattr(operator, score_config["comparison"])
+
+    def characters(self, chars):
+        if self.in_pep_seq:
+            # SAX may deliver a text node in several chunks; accumulate
+            # rather than overwrite.
+            if self.peptides[self.current_peptide] is None:
+                self.peptides[self.current_peptide] = ""
+            self.peptides[self.current_peptide] += chars
+
+    def startElement(self, name, attrs):
+        if name == "Peptide":
+            self.peptides[attrs["id"]] = None
+            self.in_peptide = True
+            self.current_peptide = attrs["id"]
+
+        if name == "PeptideSequence" and self.in_peptide:
+            self.in_pep_seq = True
+
+        if name == "SpectrumIdentificationItem":
+            if attrs["rank"] == "1":
+                self.in_sii = True
+                self.current_sii = {"peptide_ref": attrs["peptide_ref"]}
+
+        if self.in_sii and name == "cvParam":
+            if attrs["name"] == self.score_name:
+                if self.score_function(float(attrs["value"]), self.score_threshold):
+                    self.current_sii[self.score_name] = attrs["value"]
+
+    def endElement(self, name):
+        if name == "SpectrumIdentificationItem" and self.in_sii:
+            self.in_sii = False
+            # Keep only identifications whose score passed the threshold
+            if self.score_name in self.current_sii:
+                self.sii[self.current_sii["peptide_ref"]] = self.current_sii
+            self.current_sii = None
+        if name == "PeptideSequence":
+            self.in_pep_seq = False
+        if name == "Peptide":
+            self.in_peptide = False
+
+
+def parse_mzid(mzid_file, score_config):
+    verified_peptides = set()
+    parser = xml.sax.make_parser()
+    ch = msgfContentHandler(score_config)
+    parser.setContentHandler(ch)
+    parser.parse(mzid_file)
+    for peptide_ref in ch.sii.keys():
+        verified_peptides.add(ch.peptides[peptide_ref])
+    return verified_peptides
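mzid_sax_parsing.py filters rank-1 spectrum identifications by a single cvParam score. A minimal usage sketch of parse_mzid; the score_config keys are the ones the handler reads, while the cvParam name, threshold and file path shown are illustrative MSGF+ values:

    from fastg2protlib.mzid_sax_parsing import parse_mzid

    # Keep rank-1 PSMs whose "MS-GF:SpecEValue" is below 1e-10.
    # "comparison" names a function in the operator module ("lt", "le",
    # "gt", ...), applied as comparison(score_value, score_threshold).
    score_config = {
        "score_name": "MS-GF:SpecEValue",  # cvParam name to test (illustrative)
        "comparison": "lt",                # operator.lt
        "score_threshold": 1e-10,          # illustrative cutoff
    }

    peptides = parse_mzid("search_results.mzid", score_config)  # illustrative path
    print(f"{len(peptides)} verified peptide sequences")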
diff --git a/fastg2protlib/protein_score.py b/fastg2protlib/protein_score.py
new file mode 100644
index 0000000..911566f
--- /dev/null
+++ b/fastg2protlib/protein_score.py
@@ -0,0 +1,248 @@
+import logging
+import os
+import operator
+from pyteomics import parser, mzid
+import re
+import sqlite3
+import sys
+
+from fastg2protlib.mzid_sax_parsing import parse_mzid
+
+logger = logging.getLogger(__name__)
+logger.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))
+handler = logging.StreamHandler()
+formatter = logging.Formatter("%(asctime)s %(name)-12s %(levelname)-8s %(message)s")
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+session_db_name = None
+
+
+class ProteinScore:
+    def __init__(self):
+        self.protein_id = None
+        self.pct_pep_coverage = None
+        self.pct_sequence_coverage = None
+        self.verified_peptides = None
+
+    def __format__(self, formatstr):
+        return f"ID: {self.protein_id} Peptide Coverage by Count: {self.pct_pep_coverage} Protein Sequence Covered: {self.pct_sequence_coverage}"
+
+
+def _query_db(sql):
+    global session_db_name
+    conn = sqlite3.connect(session_db_name)
+    cursor = conn.cursor()
+    rs = cursor.execute(sql)
+    return rs
+
+
+def _proteins_for_peptide(pep_id):
+    """
+    peptide_to_protein
+        (peptide_id INTEGER, protein_id INTEGER);
+    """
+    rs = _query_db(
+        "SELECT protein_id FROM peptide_to_protein WHERE peptide_id = " + str(pep_id)
+    )
+    proteins = []
+    for r in rs.fetchall():
+        proteins.append(r[0])
+    return (pep_id, proteins)
+
+
+def _pepseq_mapping(peptides):
+    """
+    Uses a set of peptide sequences to return
+    a list of tuples:
+        (peptide id, list of protein ids the peptide sequence is part of)
+    """
+    sql_in = ",".join(map('"{0}"'.format, peptides))
+
+    rs = _query_db("SELECT * FROM peptide WHERE peptide.sequence IN (" + sql_in + ")")
+    ret_val = []
+    for r in rs.fetchall():
+        ret_val.append(_proteins_for_peptide(r[0]))
+    return ret_val
+
+
+def _protein_sequence_coverage(p_dict):
+    """
+    Return a list of ProteinScore objects. Coverage is filled out.
+    """
+    ret_val = []
+    for k, v in p_dict.items():
+        in_clause = "(" + ",".join(map(lambda x: str(x), v)) + ")"
+        pep_rs = _query_db("select sequence from peptide where id in " + in_clause)
+        pep_sequences = list(map(lambda x: x[0], pep_rs.fetchall()))
+        prot_rs = _query_db("select sequence from protein where id = " + str(k))
+        prot_seq = prot_rs.fetchone()[0]
+        ps = ProteinScore()
+        ps.verified_peptides = v
+        ps.protein_id = k
+        ps.pct_sequence_coverage = round(parser.coverage(prot_seq, pep_sequences), 3)
+        ret_val.append(ps)
+    return ret_val
+
+
+def _protein_dict(pep_lst):
+    # region
+    """
+    Create a dictionary:
+        key = protein id
+        value = list of peptides associated with protein
+
+    Dictionary gives us the raw count of unique peptides for each protein.
+
+    Parameters
+    ----------
+    pep_lst
+
+    Returns
+    -------
+    dictionary
+    """
+    # endregion
+    protein_dict = dict()
+    for peptide_id, proteins in pep_lst:
+        for protein_id in proteins:
+            p_id = int(protein_id)
+            if p_id not in protein_dict:
+                protein_dict[p_id] = []
+            protein_dict[p_id].append(int(peptide_id))
+    return protein_dict
+
+
+def _peptide_pct_coverage(lst):
+    # sqlite> select protein_id, GROUP_CONCAT(peptide_id) from peptide_to_protein where protein_id = 323;
+    # 323|283,327,334,338,354,2305,2821,2822,2824,3070,3590,3591,3592,3593,3594,3693,3702,4023,6636
+    for ps in lst:
+        sql = f"SELECT protein_id, GROUP_CONCAT(peptide_id) FROM peptide_to_protein WHERE protein_id = {ps.protein_id}"
+        rs = _query_db(sql)
+        # GROUP_CONCAT yields one comma-delimited string; split it so we
+        # count associated peptides, not characters.
+        all_peptides = rs.fetchall()[0][1].split(",")
+        ps.pct_pep_coverage = round(len(ps.verified_peptides) / len(all_peptides), 3)
+
+
+def _read_mzid(mzid_file, score_config):
+    logger.info(f"Starting to read MZIdentML file {mzid_file}")
+
+    verified_peptides = parse_mzid(mzid_file, score_config)
+
+    logger.info(f"Finished with the MZIdentML file {mzid_file}")
+    return verified_peptides
+
+
+def _write_protein_scores(p_scores):
+    with open("protein_scores.csv", "w") as f:
+        f.write("protein_id,pct_sequence_coverage,pct_peptide_coverage")
+        f.write(os.linesep)
+        for ps in p_scores:
+            f.write(f"{ps.protein_id},{ps.pct_sequence_coverage},{ps.pct_pep_coverage}")
+            f.write(os.linesep)
+
+
+def _write_fasta(p_scores):
+    with open("protein.fasta", "w") as f:
+        for ps in p_scores:
+            # The protein table stores only (id, sequence); the FASTA
+            # header carries the protein id and its originating walk.
+            rs = _query_db(f"SELECT sequence FROM protein WHERE id = {ps.protein_id}")
+            sequence = rs.fetchone()[0]
+            rs = _query_db(
+                f"SELECT walk FROM walk WHERE id IN (SELECT walk_id FROM protein_to_walk WHERE protein_id = {ps.protein_id})"
+            )
+            f.write(f">Protein_{ps.protein_id}|{rs.fetchone()[0]}")
+            f.write(os.linesep)
+            f.write(f"{sequence}")
+            f.write(os.linesep)
+
+
+def score_tabular(tabular_file, fdr_level, decoy_header, db_name):
+    # region
+    """
+    Use tabular MSGF+ output to score the proteins via FDR.
+    """
+    # endregion
+    global session_db_name
+    session_db_name = db_name
+
+    fwd_id = 0
+    rev_id = 0
+
+    peptide_sequences = set()
+    regex = re.compile(r"[^A-Za-z]")
+
+    with open(tabular_file) as f:
+        header = f.readline().strip().split("\t")
+        for line in f:
+            values = line.strip().split("\t")
+            psm = dict(zip(header, values))
+            if psm["Protein"].startswith(decoy_header):
+                rev_id += 1.0
+            else:
+                fwd_id += 1.0
+                if (float(rev_id) / fwd_id) > fdr_level:
+                    break
+                else:
+                    # Add peptide sequence, remove modification value
+                    peptide_sequences.add(regex.sub("", psm["Peptide"]))
+    logger.info(
+        f"Generated {len(peptide_sequences)} unique peptide sequences at FDR <= {fdr_level}"
+    )
+    proteins_to_peptides = _pepseq_mapping(peptide_sequences)
+
+    protein_dict = _protein_dict(proteins_to_peptides)
+    protein_scores = _protein_sequence_coverage(protein_dict)
+    _peptide_pct_coverage(protein_scores)
+    _write_protein_scores(protein_scores)
+    _write_fasta(protein_scores)
+    logger.info("Completed scoring using MSGF+ tabular output")
+
+
+"""
+Example MSGF+ tabular PSM, parsed into a dict keyed by the header row:
+{
+    '#SpecFile': 'T4A.mzML', 'SpecID': 'index=262484', 'ScanNum': '-1',
+    'FragMethod': 'CID', 'Precursor': '1111.6', 'IsotopeError': '1',
+    'PrecursorError(ppm)': '5.181823', 'Charge': '3', 'Peptide': 'FDIPIIQLQPPFVVDLLDSNIAVFGAAM+15.995SGK',
+    'Protein': 'Pep_1590|Protein_133(pre=-,post=-)', 'DeNovoScore': '160', 'MSGFScore': '-61',
+    'SpecEValue': '0.025976893', 'EValue': '6830.0264', 'QValue': '0.4996623', 'PepQValue': '0.80702245'}
+"""
+
+
+def score_proteins(mzid_file, score_object, db_name="tst.db"):
+    # region
+    """
+    Purpose
+    -------
+    User has searched peptides against MSMS data. Now, we will produce
+    a FASTA library of proteins based on the verified peptides from the search.
+
+    Inputs
+    ------
+    MzIdentML file.
+
+    Return
+    ------
+    Protein FASTA file.
+    """
+    # endregion
+    global session_db_name
+    session_db_name = db_name
+
+    verified_peptides = _read_mzid(mzid_file, score_object)
+    proteins_to_peptides = _pepseq_mapping(verified_peptides)
+
+    protein_dict = _protein_dict(proteins_to_peptides)
+    protein_scores = _protein_sequence_coverage(protein_dict)
+    _peptide_pct_coverage(protein_scores)
+    _write_protein_scores(protein_scores)
+    _write_fasta(protein_scores)
+    logger.info("Completed scoring")
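score_tabular estimates FDR as the running decoy/target ratio over the ordered PSM stream and stops accepting peptides at the first crossing. A small worked example of that cutoff rule on labels alone (the PSM labels are illustrative):

    # Running decoy/target ratio over an ordered PSM stream, stopping when
    # the estimated FDR first exceeds the requested level; this is the same
    # rule score_tabular applies line by line.
    psms = ["T", "T", "T", "T", "D", "T", "T", "D", "T", "D"]  # T=target, D=decoy
    fdr_level = 0.10

    fwd, rev, accepted = 0, 0, []
    for i, label in enumerate(psms):
        if label == "D":
            rev += 1
        else:
            fwd += 1
            if rev / fwd > fdr_level:
                break
            accepted.append(i)

    print(accepted)  # [0, 1, 2, 3]: targets accepted at <= 10% estimated FDR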
+ + """ + # endregion + global session_db_name + session_db_name = db_name + + verified_peptides = _read_mzid(mzid_file, score_object) + proteins_to_peptides = _pepseq_mapping(verified_peptides) + + protein_dict = _protein_dict(proteins_to_peptides) + protein_scores = _protein_sequence_coverage(protein_dict) + _peptide_pct_coverage(protein_scores) + _write_protein_scores(protein_scores) + _write_fasta(protein_scores) + logger.info("Completed scoring") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..08a15fc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,27 @@ +appdirs==1.4.3 +attrs==19.3.0 +biopython==1.76 +black==19.10b0 +click==7.1.1 +cycler==0.10.0 +decorator==4.4.2 +kiwisolver==1.2.0 +lxml==4.5.0 +matplotlib==3.2.2 +more-itertools==8.4.0 +networkx==2.4 +numpy==1.18.2 +packaging==20.4 +pathspec==0.7.0 +pluggy==0.13.1 +py==1.9.0 +pyparsing==2.4.7 +pyteomics==4.2 +pytest==5.4.3 +pytest-datadir==1.3.1 +python-dateutil==2.8.1 +regex==2020.2.20 +six==1.15.0 +toml==0.10.0 +typed-ast==1.4.1 +wcwidth==0.2.5 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4de0952 --- /dev/null +++ b/setup.py @@ -0,0 +1,28 @@ +import setuptools + +_pkg_version = "1.0.0" +_author = "Thomas McGowan" +_author_email = "mcgo0092@umn.edu" +_license = "MIT" +_repo = "https://github.com/galaxyproteomics/fastg2protlib" + + +setuptools.setup( + name="fastg2protlib", + version=f"{_pkg_version}", + packages=["fastg2protlib"], + install_requires=["biopython", "lxml", "networkx", "numpy", "pyteomics", "matplotlib"], + description="FASTG sequences to a protein library", + author=f"{_author}", + author_email=f"{_author_email}", + url="f{_repo}", + download_url=f"{_repo}/archive/v_{_pkg_version}.tar.gz", + license=f"{_license}", + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Intended Audience :: Science/Research", + ], + python_requires=">=3.6", +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/two.fastg b/tests/data/two.fastg new file mode 100644 index 0000000..f29abe3 --- /dev/null +++ b/tests/data/two.fastg @@ -0,0 +1,40 @@ +>EDGE_1_length_84_cov_1.0:EDGE_3_length_84_cov_1.0; +CGTTATTCGCGCCCACTCTCCCATTTATCCGCGCAAGCGGATGCGATGCGATTGCCCGCTAAGATATTCTTACCATTCTCGACA +>EDGE_1_length_84_cov_1.0'; +TGTCGAGAATGGTAAGAATATCTTAGCGGGCAATCGCATCGCATCCGCTTGCGCGGATAAATGGGAGAGTGGGCGCGAATAACG +>EDGE_2_length_84_cov_1.0:EDGE_3_length_84_cov_1.0; +CTGGTCCTGTTGACTACAATGGGCCCAACTCAATCACAGCTCGAGCGCCTTGAATAACATACTCATCTCTATACATTCTCGACA +>EDGE_2_length_84_cov_1.0':EDGE_3_length_84_cov_1.0'; +TGTCGAGAATGTATAGAGATGAGTATGTTATTCAAGGCGCTCGAGCTGTGATTGAGTTGGGCCCATTGTAGTCAACAGGACCAG +>EDGE_3_length_84_cov_1.0:EDGE_2_length_84_cov_1.0,EDGE_4_length_84_cov_1.0; +CATTCTCGACATGCTGAGCTGAGACGGCGTCGATGCATAGCGGACTTTCGGTCAGTCGCAATTCCTCACGAGACTGGTCCTGTT +>EDGE_3_length_84_cov_1.0':EDGE_2_length_84_cov_1.0',EDGE_1_length_84_cov_1.0'; +AACAGGACCAGTCTCGTGAGGAATTGCGACTGACCGAAAGTCCGCTATGCATCGACGCCGTCTCAGCTCAGCATGTCGAGAATG +>EDGE_4_length_84_cov_1.0:EDGE_5_length_84_cov_1.0; +CTGGTCCTGTTACAGAGCTGGCGTACGCGTTGAACACTTCACAGATGATAGGGATTCGGGTAAAGAGCGTGTCATTGGGGGCTT +>EDGE_4_length_84_cov_1.0':EDGE_3_length_84_cov_1.0'; +AAGCCCCCAATGACACGCTCTTTACCCGAATCCCTATCATCTGTGAAGTGTTCAACGCGTACGCCAGCTCTGTAACAGGACCAG +>EDGE_5_length_84_cov_1.0; +ATTGGGGGCTTCATACATAGAGCAAGGGCGTCGAACGGTCGTGAAAGTCTTAGTACCGCACGTACCAACTTACTGAGGATATTG 
+>EDGE_5_length_84_cov_1.0':EDGE_4_length_84_cov_1.0',EDGE_6_length_84_cov_1.0'; +CAATATCCTCAGTAAGTTGGTACGTGCGGTACTAAGACTTTCACGACCGTTCGACGCCCTTGCTCTATGTATGAAGCCCCCAAT +>EDGE_6_length_84_cov_1.0:EDGE_5_length_84_cov_1.0; +AAGAGGCCGCCACCGTTTTAGGGGGGGAAGGTTGAAGATCTCCTCTTCTCATGACTGAACTCGCGAGGGCCGTATTGGGGGCTT +>EDGE_6_length_84_cov_1.0':EDGE_8_length_84_cov_1.0'; +AAGCCCCCAATACGGCCCTCGCGAGTTCAGTCATGAGAAGAGGAGATCTTCAACCTTCCCCCCCTAAAACGGTGGCGGCCTCTT +>EDGE_7_length_84_cov_1.0:EDGE_8_length_84_cov_1.0; +AAGAGGCCGCCAAAGAACAAAGGCTTACTGTGCGCAGAGGAACGCCCATTTAGCGGCTGGCGTTTTGAATCCTTTTAATATTGT +>EDGE_7_length_84_cov_1.0':EDGE_8_length_84_cov_1.0'; +ACAATATTAAAAGGATTCAAAACGCCAGCCGCTAAATGGGCGTTCCTCTGCGCACAGTAAGCCTTTGTTCTTTGGCGGCCTCTT +>EDGE_8_length_84_cov_1.0:EDGE_7_length_84_cov_1.0,EDGE_6_length_84_cov_1.0; +TTTAATATTGTTTAATCCAATTCCCTCATTTAGGACCCTACCAAGTCAACATTGGTATATGAATGCGACCTCGAAGAGGCCGCC +>EDGE_8_length_84_cov_1.0':EDGE_7_length_84_cov_1.0',EDGE_9_length_84_cov_1.0'; +GGCGGCCTCTTCGAGGTCGCATTCATATACCAATGTTGACTTGGTAGGGTCCTAAATGAGGGAATTGGATTAAACAATATTAAA +>EDGE_9_length_84_cov_1.0:EDGE_8_length_84_cov_1.0; +TAAAAATGACAGTGGTTGGTGCTCTAAACTTCATTTGGTTAACTCGTGTATCAGCGCGATAGGCTGTTAGAGGTTTAATATTGT +>EDGE_9_length_84_cov_1.0'; +ACAATATTAAACCTCTAACAGCCTATCGCGCTGATACACGAGTTAACCAAATGAAGTTTAGAGCACCAACCACTGTCATTTTTA +>EDGE_10_length_84_cov_1.0; +ATGGCAAGGTACTTCCGGTCTTAATGAATGGCCGGGAAAGGTACGCACGCGGTATGGGGGGGTGAAGGGGCGAATAGACAGGCT +>EDGE_10_length_84_cov_1.0':EDGE_10_length_84_cov_1.0; +AGCCTGTCTATTCGCCCCTTCACCCCCCCATACCGCGTGCGTACCTTTCCCGGCCATTCATTAAGACCGGAAGTACCTTGCCAT diff --git a/tests/test_basic.py b/tests/test_basic.py new file mode 100644 index 0000000..6e9c068 --- /dev/null +++ b/tests/test_basic.py @@ -0,0 +1,18 @@ +from fastg2protlib import fastg2protlib +import os + + +def test_peptides_for_fastg(shared_datadir, tmpdir): + contents = (shared_datadir / 'two.fastg').read_text() + sub = tmpdir.mkdir("sub") + p = sub.join("fastg_input") + p.write(contents) + + fastg2protlib.peptides_for_fastg( + os.path.join(sub, "fastg_input"), + min_protein_length=40, + db_name=os.path.join(sub, "test.db"), + peptide_file_name=os.path.join(sub, "peptide.fasta")) + with open(os.path.join(sub, "peptide.fasta")) as f: + peps = f.readlines() + assert len(peps) == 68 \ No newline at end of file