-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 6470d4a
Showing
14 changed files
with
1,199 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
dist/ | ||
build/ | ||
fastg2protlib.egg-info/ | ||
|
||
venv/ | ||
.vscode/ | ||
.pytest_cache/ | ||
fastg2protlib/__pycache__/ | ||
tests/__pycache__/ | ||
.idea/ | ||
*.db | ||
*.csv | ||
*.fasta | ||
*.mzid | ||
notes.txt | ||
run_* | ||
*.tabular | ||
|
||
.DS_Store | ||
application.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
MIT License | ||
Copyright (c) 2018 YOUR NAME | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# FASTG to Protein Library | ||
|
||
|
||
This package generates a candidate protein library in two phases: | ||
|
||
1) Parsing a FASTG file to create graph traversals of longer stretches of DNA | ||
|
||
- FASTG is parsed into a directed graph. A depth-first search is made on all connecting | ||
edges. The DFS traversal is then used to concatenate all DNA sequences in the path. | ||
- DNA sequences are translated to mRNA and split into candidate proteins at the stop | ||
codon. Each DFS traversal can, and will, produce a set of candidate protein sequences. | ||
- Protein sequences are filtered on length and amino acid redundancy. | ||
- Protein sequences are cleaved into peptide sequences. | ||
- DFS traversals, proteins and peptides are stored in a SQLite database. The linking | ||
relationship between all three is maintained in the DB. | ||
- A FASTA file of peptides is produced for the user. This FASTA file is to be used | ||
in a search against MSMS data. | ||
|
||
2) Using verified peptides as a filter to produce a final candidate peptide library | ||
|
||
- The user will invoke the code with | ||
- DB | ||
- list of peptide sequences or peptide FASTA | ||
- It is expected that the submitted peptides have been verified against MSMS and | ||
they represent found and identified peptide sequences | ||
- The verified peptides are used to filter proteins from the database, these | ||
proteins become the final library. | ||
- The verified peptides are used to score the proteins for | ||
- coverage | ||
- percent of verified v. total peptide association | ||
- Final user output | ||
- SQLite database | ||
- Protein score text file, comma delimited | ||
- Filtered protein FASTA file |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
import logging | ||
import os | ||
import sqlite3 | ||
|
||
|
||
class DBManager: | ||
def __init__(self, name="app.db"): | ||
self.database_name = name | ||
self.conn = sqlite3.connect(self.database_name) | ||
self.cursor = self.conn.cursor() | ||
self.logger = logging.getLogger(__name__) | ||
self.logger.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) | ||
handler = logging.StreamHandler() | ||
formatter = logging.Formatter( | ||
"%(asctime)s %(name)-12s %(levelname)-8s %(message)s" | ||
) | ||
handler.setFormatter(formatter) | ||
self.logger.addHandler(handler) | ||
# Build the tables | ||
self.cursor.execute( | ||
"""CREATE TABLE IF NOT EXISTS walk | ||
(id INTEGER PRIMARY KEY, walk TEXT, dna_sequence TEXT) | ||
""" | ||
) | ||
|
||
self.cursor.execute( | ||
"""CREATE TABLE IF NOT EXISTS protein | ||
(id INTEGER PRIMARY KEY, sequence TEXT)""" | ||
) | ||
|
||
self.cursor.execute( | ||
"""CREATE TABLE IF NOT EXISTS peptide | ||
(id INTEGER PRIMARY KEY, sequence TEXT)""" | ||
) | ||
|
||
self.cursor.execute( | ||
"""CREATE TABLE IF NOT EXISTS protein_to_walk | ||
(walk_id INTEGER, protein_id INTEGER)""" | ||
) | ||
|
||
self.cursor.execute( | ||
"""CREATE TABLE IF NOT EXISTS peptide_to_protein | ||
(peptide_id INTEGER, protein_id INTEGER)""" | ||
) | ||
|
||
self.conn.commit() | ||
|
||
def exec_query(self, query): | ||
return self.cursor.execute(query).fetchall() | ||
|
||
def d_quote(self, str): | ||
return '"' + str + '"' | ||
|
||
def link_proteins_peptides(self, prot_peps): | ||
""" | ||
Populate many-to-many table linking peptides to proteins | ||
prot_pep object: | ||
{ | ||
"prot_id": int, | ||
"peptides": set() | ||
} | ||
""" | ||
self.logger.info("Begin link_proteins_peptides") | ||
sql = sqlite3.connect(self.database_name) | ||
sql.isolation_level = None | ||
c = sql.cursor() | ||
c.execute("begin") | ||
try: | ||
for obj in prot_peps: | ||
protein_id = obj["prot_id"] | ||
pep_seqs = [] | ||
for pep_seq in obj["peptides"]: | ||
pep_seqs.append(pep_seq) | ||
in_clause = "(" + ",".join(list(map(self.d_quote, pep_seqs))) + ")" | ||
sql_q = f"SELECT id FROM peptide WHERE sequence IN {in_clause}" | ||
pep_ids = c.execute(sql_q).fetchall() | ||
ins_data = [] | ||
for pep_id in pep_ids: | ||
ins_data.append((pep_id[0], protein_id)) | ||
sql_ins = f"INSERT INTO peptide_to_protein(peptide_id, protein_id) VALUES (?, ?)" | ||
c.executemany(sql_ins, ins_data) | ||
c.execute("commit") | ||
except sql.Error: | ||
self.logger.error( | ||
"Transaction failed, rolling back in link_proteins_peptides" | ||
) | ||
c.execute("rollback") | ||
self.logger.info("End link_proteins_peptides") | ||
|
||
def insert_peptides(self, prot_peps): | ||
"""{ | ||
"prot_id": int, | ||
"peptides": set() | ||
} | ||
""" | ||
self.logger.info(f"Begin Insert peptides") | ||
sql = sqlite3.connect(self.database_name) | ||
sql.isolation_level = None | ||
c = sql.cursor() | ||
c.execute("begin") | ||
|
||
def data_iter(data): | ||
"""Filter out duplicate sequences | ||
""" | ||
seen = set() | ||
for item in data: | ||
if item in seen: | ||
pass | ||
else: | ||
seen.add(item) | ||
yield item | ||
|
||
try: | ||
pep_seqs = [] | ||
for pp in prot_peps: | ||
for p in pp["peptides"]: | ||
pep_seqs.append((p,)) | ||
c.executemany( | ||
"INSERT INTO peptide(sequence) VALUES (?)", data_iter(pep_seqs) | ||
) | ||
c.execute("commit") | ||
except sql.Error: | ||
self.logger.error("Transaction failed, rolling back in insert_peptides") | ||
c.execute("rollback") | ||
# Add index | ||
self.logger.info("Creating peptide_sequence index") | ||
c.execute("CREATE INDEX peptide_sequence ON peptide(sequence)") | ||
self.link_proteins_peptides(prot_peps) | ||
self.logger.info(f"End Insert peptides") | ||
|
||
def build_links(self, objects): | ||
""" | ||
Populate many-to-many for proteins to walk traversals | ||
""" | ||
self.logger.info(f"Begin build_links") | ||
sql = sqlite3.connect(self.database_name) | ||
sql.isolation_level = None | ||
c = sql.cursor() | ||
c.execute("begin") | ||
try: | ||
for object in objects: | ||
walk = ",".join(object["walk"]) | ||
w_s = f'select id from walk where walk = "{walk}"' | ||
w_id = c.execute(w_s).fetchone() | ||
prot_seqs = [] | ||
for p in object["proteins"]: | ||
prot_seqs.append(p) | ||
in_clause = "(" + ",".join(list(map(self.d_quote, prot_seqs))) + ")" | ||
sql_q = f"SELECT id FROM protein WHERE sequence IN {in_clause}" | ||
prot_ids = c.execute(sql_q).fetchall() | ||
ins_data = [] | ||
for prot_id in prot_ids: | ||
ins_data.append((w_id[0], prot_id[0])) | ||
sql_ins = ( | ||
"INSERT INTO protein_to_walk(walk_id, protein_id) VALUES (?,?)" | ||
) | ||
c.executemany(sql_ins, ins_data) | ||
c.execute("commit") | ||
except sql.Error: | ||
self.logger.error("Transaction failed, rolling back in build_links") | ||
c.execute("rollback") | ||
self.logger.info(f"End build_links") | ||
|
||
def insert_protein_walk_objects(self, objects): | ||
self.logger.info(f"Begin insert_protein_walk_objects") | ||
sql = sqlite3.connect(self.database_name) | ||
sql.isolation_level = None | ||
c = sql.cursor() | ||
c.execute("begin") | ||
|
||
def protein_sequence_iter(objects): | ||
for object in objects: | ||
for p in object["proteins"]: | ||
yield (p,) | ||
|
||
def walk_sequence_iter(objects): | ||
for object in objects: | ||
walk = ",".join(object["walk"]) | ||
seq = object["dna_sequence"] | ||
yield (walk, seq) | ||
|
||
try: | ||
c.executemany( | ||
"INSERT INTO protein(sequence) VALUES (?)", | ||
protein_sequence_iter(objects), | ||
) | ||
c.executemany( | ||
"INSERT INTO walk (walk, dna_sequence) VALUES (?,?)", | ||
walk_sequence_iter(objects), | ||
) | ||
c.execute("commit") | ||
except sql.Error: | ||
self.logger.error( | ||
"Transaction failed, rolling back in insert_protein_walk_objects" | ||
) | ||
c.execute("rollback") | ||
# Create indexes | ||
self.logger.info("Creating indexes: walks_walk, walks_w_d and protein_sequence") | ||
c.execute("CREATE INDEX walks_walk ON walk(walk)") | ||
c.execute("CREATE INDEX protein_sequence ON protein(sequence)") | ||
c.execute("CREATE INDEX walks_w_d ON walk(walk, dna_sequence)") | ||
|
||
self.build_links(objects) | ||
self.logger.info(f"End insert_protein_walk_objects") | ||
|
||
def get_protein_sequences(self): | ||
self.logger.info(f"Begin get_protein_sequences") | ||
c = self.conn.execute("select id, sequence from protein") | ||
r_val = [x for x in c] | ||
self.logger.info(f"End get_protein_sequences") | ||
return r_val |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
from collections import defaultdict | ||
|
||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
|
||
from Bio.SeqUtils import GC | ||
|
||
|
||
def _sequence_histogram(seq_lengths, title, file_name): | ||
plt.hist(seq_lengths, 20, facecolor="blue", alpha=0.5) | ||
plt.xlabel("Length") | ||
plt.ylabel("Count") | ||
plt.title(title) | ||
plt.savefig(file_name) | ||
plt.clf() | ||
|
||
|
||
def _gc_plot(sequences, title, file_name): | ||
gc_values = sorted(GC(seq) for seq in sequences) | ||
plt.plot(gc_values) | ||
plt.title(title) | ||
plt.xlabel("ORFs") | ||
plt.ylabel("GC%") | ||
plt.savefig(file_name) | ||
plt.clf() | ||
|
||
|
||
def _aa_barchart(prot_sequences): | ||
aa_count = defaultdict(int) | ||
for sequence in prot_sequences: | ||
for aa in sequence: | ||
aa_count[aa] += 1 | ||
|
||
objects = [*aa_count] | ||
objects.sort() | ||
y_pos = np.arange(len(objects)) | ||
performance = [] | ||
for aa in objects: | ||
performance.append(aa_count[aa]) | ||
|
||
plt.bar(y_pos, performance, align="center", facecolor="blue", alpha=0.5) | ||
plt.xticks(y_pos, objects) | ||
plt.ylabel("Count") | ||
plt.title("Amino Acid Count for Translated Proteins") | ||
plt.savefig("aa_count_chart.png") | ||
plt.clf() | ||
|
||
|
||
def generate_diagnostic_plots(db_manager): | ||
dna_results = db_manager.exec_query("select dna_sequence from walk") | ||
|
||
dna_lengths = map(lambda x: len(x[0]), dna_results) | ||
_sequence_histogram( | ||
list(dna_lengths), "DNA Sequence Lengths from FASTG", "fastg_seq_lengths.png" | ||
) | ||
|
||
protein_results = db_manager.exec_query("select sequence from protein") | ||
protein_lengths = map(lambda x: len(x[0]), protein_results) | ||
_sequence_histogram( | ||
list(protein_lengths), "Protein Sequence Lengths", "protein_seq_lengths.png" | ||
) | ||
|
||
_gc_plot( | ||
list(map(lambda x: x[0], dna_results)), | ||
"GC Pct. - Translated FASTG", | ||
"gc_pct.png", | ||
) | ||
|
||
# bar chart of all translated protein amino acids | ||
_aa_barchart(list(map(lambda x: x[0], protein_results))) |
Oops, something went wrong.