Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
tmcgowan committed Jul 21, 2020
0 parents commit 6470d4a
Show file tree
Hide file tree
Showing 14 changed files with 1,199 additions and 0 deletions.
20 changes: 20 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
dist/
build/
fastg2protlib.egg-info/

venv/
.vscode/
.pytest_cache/
fastg2protlib/__pycache__/
tests/__pycache__/
.idea/
*.db
*.csv
*.fasta
*.mzid
notes.txt
run_*
*.tabular

.DS_Store
application.py
17 changes: 17 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
MIT License
Copyright (c) 2018 YOUR NAME
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
34 changes: 34 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# FASTG to Protein Library


This package generates a candidate protein library in two phases:

1) Parsing a FASTG file to create graph traversals of longer stretches of DNA

- FASTG is parsed into a directed graph. A depth-first search is made on all connecting
edges. The DFS traversal is then used to concatenate all DNA sequences in the path.
- DNA sequences are translated to mRNA and split into candidate proteins at the stop
codon. Each DFS traversal can, and will, produce a set of candidate protein sequences.
- Protein sequences are filtered on length and amino acid redundancy.
- Protein sequences are cleaved into peptide sequences.
- DFS traversals, proteins and peptides are stored in a SQLite database. The linking
relationship between all three is maintained in the DB.
- A FASTA file of peptides is produced for the user. This FASTA file is to be used
in a search against MSMS data.

2) Using verified peptides as a filter to produce a final candidate peptide library

- The user will invoke the code with
- DB
- list of peptide sequences or peptide FASTA
- It is expected that the submitted peptides have been verified against MSMS and
they represent found and identified peptide sequences
- The verified peptides are used to filter proteins from the database, these
proteins become the final library.
- The verified peptides are used to score the proteins for
- coverage
- percent of verified v. total peptide association
- Final user output
- SQLite database
- Protein score text file, comma delimited
- Filtered protein FASTA file
Empty file added fastg2protlib/__init__.py
Empty file.
211 changes: 211 additions & 0 deletions fastg2protlib/db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
import logging
import os
import sqlite3


class DBManager:
def __init__(self, name="app.db"):
self.database_name = name
self.conn = sqlite3.connect(self.database_name)
self.cursor = self.conn.cursor()
self.logger = logging.getLogger(__name__)
self.logger.setLevel(level=os.environ.get("LOGLEVEL", "INFO"))
handler = logging.StreamHandler()
formatter = logging.Formatter(
"%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
)
handler.setFormatter(formatter)
self.logger.addHandler(handler)
# Build the tables
self.cursor.execute(
"""CREATE TABLE IF NOT EXISTS walk
(id INTEGER PRIMARY KEY, walk TEXT, dna_sequence TEXT)
"""
)

self.cursor.execute(
"""CREATE TABLE IF NOT EXISTS protein
(id INTEGER PRIMARY KEY, sequence TEXT)"""
)

self.cursor.execute(
"""CREATE TABLE IF NOT EXISTS peptide
(id INTEGER PRIMARY KEY, sequence TEXT)"""
)

self.cursor.execute(
"""CREATE TABLE IF NOT EXISTS protein_to_walk
(walk_id INTEGER, protein_id INTEGER)"""
)

self.cursor.execute(
"""CREATE TABLE IF NOT EXISTS peptide_to_protein
(peptide_id INTEGER, protein_id INTEGER)"""
)

self.conn.commit()

def exec_query(self, query):
return self.cursor.execute(query).fetchall()

def d_quote(self, str):
return '"' + str + '"'

def link_proteins_peptides(self, prot_peps):
"""
Populate many-to-many table linking peptides to proteins
prot_pep object:
{
"prot_id": int,
"peptides": set()
}
"""
self.logger.info("Begin link_proteins_peptides")
sql = sqlite3.connect(self.database_name)
sql.isolation_level = None
c = sql.cursor()
c.execute("begin")
try:
for obj in prot_peps:
protein_id = obj["prot_id"]
pep_seqs = []
for pep_seq in obj["peptides"]:
pep_seqs.append(pep_seq)
in_clause = "(" + ",".join(list(map(self.d_quote, pep_seqs))) + ")"
sql_q = f"SELECT id FROM peptide WHERE sequence IN {in_clause}"
pep_ids = c.execute(sql_q).fetchall()
ins_data = []
for pep_id in pep_ids:
ins_data.append((pep_id[0], protein_id))
sql_ins = f"INSERT INTO peptide_to_protein(peptide_id, protein_id) VALUES (?, ?)"
c.executemany(sql_ins, ins_data)
c.execute("commit")
except sql.Error:
self.logger.error(
"Transaction failed, rolling back in link_proteins_peptides"
)
c.execute("rollback")
self.logger.info("End link_proteins_peptides")

def insert_peptides(self, prot_peps):
"""{
"prot_id": int,
"peptides": set()
}
"""
self.logger.info(f"Begin Insert peptides")
sql = sqlite3.connect(self.database_name)
sql.isolation_level = None
c = sql.cursor()
c.execute("begin")

def data_iter(data):
"""Filter out duplicate sequences
"""
seen = set()
for item in data:
if item in seen:
pass
else:
seen.add(item)
yield item

try:
pep_seqs = []
for pp in prot_peps:
for p in pp["peptides"]:
pep_seqs.append((p,))
c.executemany(
"INSERT INTO peptide(sequence) VALUES (?)", data_iter(pep_seqs)
)
c.execute("commit")
except sql.Error:
self.logger.error("Transaction failed, rolling back in insert_peptides")
c.execute("rollback")
# Add index
self.logger.info("Creating peptide_sequence index")
c.execute("CREATE INDEX peptide_sequence ON peptide(sequence)")
self.link_proteins_peptides(prot_peps)
self.logger.info(f"End Insert peptides")

def build_links(self, objects):
"""
Populate many-to-many for proteins to walk traversals
"""
self.logger.info(f"Begin build_links")
sql = sqlite3.connect(self.database_name)
sql.isolation_level = None
c = sql.cursor()
c.execute("begin")
try:
for object in objects:
walk = ",".join(object["walk"])
w_s = f'select id from walk where walk = "{walk}"'
w_id = c.execute(w_s).fetchone()
prot_seqs = []
for p in object["proteins"]:
prot_seqs.append(p)
in_clause = "(" + ",".join(list(map(self.d_quote, prot_seqs))) + ")"
sql_q = f"SELECT id FROM protein WHERE sequence IN {in_clause}"
prot_ids = c.execute(sql_q).fetchall()
ins_data = []
for prot_id in prot_ids:
ins_data.append((w_id[0], prot_id[0]))
sql_ins = (
"INSERT INTO protein_to_walk(walk_id, protein_id) VALUES (?,?)"
)
c.executemany(sql_ins, ins_data)
c.execute("commit")
except sql.Error:
self.logger.error("Transaction failed, rolling back in build_links")
c.execute("rollback")
self.logger.info(f"End build_links")

def insert_protein_walk_objects(self, objects):
self.logger.info(f"Begin insert_protein_walk_objects")
sql = sqlite3.connect(self.database_name)
sql.isolation_level = None
c = sql.cursor()
c.execute("begin")

def protein_sequence_iter(objects):
for object in objects:
for p in object["proteins"]:
yield (p,)

def walk_sequence_iter(objects):
for object in objects:
walk = ",".join(object["walk"])
seq = object["dna_sequence"]
yield (walk, seq)

try:
c.executemany(
"INSERT INTO protein(sequence) VALUES (?)",
protein_sequence_iter(objects),
)
c.executemany(
"INSERT INTO walk (walk, dna_sequence) VALUES (?,?)",
walk_sequence_iter(objects),
)
c.execute("commit")
except sql.Error:
self.logger.error(
"Transaction failed, rolling back in insert_protein_walk_objects"
)
c.execute("rollback")
# Create indexes
self.logger.info("Creating indexes: walks_walk, walks_w_d and protein_sequence")
c.execute("CREATE INDEX walks_walk ON walk(walk)")
c.execute("CREATE INDEX protein_sequence ON protein(sequence)")
c.execute("CREATE INDEX walks_w_d ON walk(walk, dna_sequence)")

self.build_links(objects)
self.logger.info(f"End insert_protein_walk_objects")

def get_protein_sequences(self):
self.logger.info(f"Begin get_protein_sequences")
c = self.conn.execute("select id, sequence from protein")
r_val = [x for x in c]
self.logger.info(f"End get_protein_sequences")
return r_val
70 changes: 70 additions & 0 deletions fastg2protlib/diagnostics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np

from Bio.SeqUtils import GC


def _sequence_histogram(seq_lengths, title, file_name):
plt.hist(seq_lengths, 20, facecolor="blue", alpha=0.5)
plt.xlabel("Length")
plt.ylabel("Count")
plt.title(title)
plt.savefig(file_name)
plt.clf()


def _gc_plot(sequences, title, file_name):
gc_values = sorted(GC(seq) for seq in sequences)
plt.plot(gc_values)
plt.title(title)
plt.xlabel("ORFs")
plt.ylabel("GC%")
plt.savefig(file_name)
plt.clf()


def _aa_barchart(prot_sequences):
aa_count = defaultdict(int)
for sequence in prot_sequences:
for aa in sequence:
aa_count[aa] += 1

objects = [*aa_count]
objects.sort()
y_pos = np.arange(len(objects))
performance = []
for aa in objects:
performance.append(aa_count[aa])

plt.bar(y_pos, performance, align="center", facecolor="blue", alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel("Count")
plt.title("Amino Acid Count for Translated Proteins")
plt.savefig("aa_count_chart.png")
plt.clf()


def generate_diagnostic_plots(db_manager):
dna_results = db_manager.exec_query("select dna_sequence from walk")

dna_lengths = map(lambda x: len(x[0]), dna_results)
_sequence_histogram(
list(dna_lengths), "DNA Sequence Lengths from FASTG", "fastg_seq_lengths.png"
)

protein_results = db_manager.exec_query("select sequence from protein")
protein_lengths = map(lambda x: len(x[0]), protein_results)
_sequence_histogram(
list(protein_lengths), "Protein Sequence Lengths", "protein_seq_lengths.png"
)

_gc_plot(
list(map(lambda x: x[0], dna_results)),
"GC Pct. - Translated FASTG",
"gc_pct.png",
)

# bar chart of all translated protein amino acids
_aa_barchart(list(map(lambda x: x[0], protein_results)))
Loading

0 comments on commit 6470d4a

Please sign in to comment.