-
Notifications
You must be signed in to change notification settings - Fork 0
/
motif_mark_code.py
311 lines (264 loc) · 11.6 KB
/
motif_mark_code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#!/bin/python
# Author: Wyatt Eng
# Date 2/17/20
# Goal: Parse a FASTA file that contains a gene (with introns and exons) and identify biological motifs. Motifs short sequences
# of DNA that is indicative of structural elements of proteins. The input of the software should be a FASTA file and
# a file containing the known motifs. The output should be a graphical version that identifies every motif location.
import cairo
import math
import argparse
import re
import os
from Bio import SeqIO
from matplotlib import cm
import random as random
import warnings
# Set input variables
def get_args():
parser = argparse.ArgumentParser(
description="Identify motifs from gene files and produce a figure showing each motif's location across exons and introns. \
Uppercase nucleotides should signify exons and Lowercase nucleotides should signify introns.")
parser.add_argument("-f", "--file_name", help="path to a fasta file containing gene sequences", type=str)
parser.add_argument("-m", "--motif_file_name", help="path to a file containing a list of the known motifs", type=str)
parser.add_argument("-o", "--out_file_png_name", help="Desired outfile path + name for the output png file", type=str)
parser.add_argument("-t", "--molecule_type", help="Molecule type of fasta (gene) file: RNA or DNA", type=str)
return parser.parse_args()
args = get_args()
# Setting user inputted variables
input_file = args.file_name
motif_file = args.motif_file_name
out_file = args.out_file_png_name
fasta_convert = args.molecule_type
##### !!!!!DELETE LATER!!!!! #####
motif_file = "Fig_1_motifs.txt"
input_file = "Figure_1.fasta"
out_file = "test.png"
fasta_convert = "dna"
##### !!!!!Remove if code does not work and you want to print warnings!!!!! #####
warnings.filterwarnings("ignore") # ignore all warnings because biopython seems to throw a warning
fasta_convert = fasta_convert.upper() # make molecule type all uppercase
##### Global Variables ######
# Dictionary to store every possible motif
motifDict = {}
# Dictionary to associate a number to a motif
motif_numb = {}
# List storing the motif lengths
length_motif = []
# List storing the differences in lengths of the motifs
uniq_diff_motif_len = []
# Dictionary for amiguous nucleotides
ambig_dict = {'U':['T'], 'W':['A', 'T'], 'S':['C','G'], 'M':['A','C'], 'K':['G','T'], 'R':['A','G'], 'Y':['C','T'], 'B':['C','G','T'], 'D':['A','G','T'], 'H':['A','C','T'], 'V':['A','C','G'], 'N':['A','C','G','T']}
class cMotif:
def __init__(self, mStart_pos, mStop_pos, mNumber):
self.mStart_pos = mStart_pos
self.mStop_pos = mStop_pos
self.mColor = mNumber
class cGene:
def __init__(self, mLength, mExonStart, mExonStop, mHeaderName, mMotifList, mGeneNum, mMotifDict):
self.mLength = mLength
self.mExonStart = mExonStart
self.mExonStop = mExonStop
self.mHeaderName = mHeaderName
self.mMotifList = mMotifList
self.mGeneNum = mGeneNum
self.mMotifDict = mMotifDict
def f_IterateMotif(original, sequence, index):
'''Function to iterate through every possible motif accounting for U and Y in the motif sequence. \
Dictionary key: different possibilities of motifs / value: given motif \
U (purines) = T or U / Y (pyrimidines) = C or T or U \
Recursive Function! Shoutout to Clio Skevington) \
Returns: Nothing, stores all possible motifs to dictionary'''
if index == len(original):
motifDict[sequence] = original
elif sequence[index] in ambig_dict.keys():
current_key = sequence[index]
for nuc in ambig_dict[current_key]:
seq_list = list(sequence)
seq_list[index] = nuc
sequence = "".join(seq_list)
f_IterateMotif(original, sequence, index + 1)
else:
f_IterateMotif(original, sequence, index + 1)
def f_SaveMotifs(motif_file_name):
'''Function to take in the motif file name and save known motifs in a dictionary. \
Dictionary key: different possibilities of motifs / value: given motif \
U (purines) = A or G / Y (pyrimidines) = C or T\
Returns: Nothing'''
index_val = 1
with open(motif_file_name) as mf:
for line in mf:
line = line.strip()
if line != "":
line = line.upper()
length_motif.append(len(line))
f_IterateMotif(line, line, 0)
motif_numb.setdefault(line, index_val)
index_val = index_val + 1
length_motif.sort(reverse=True)
shortest_motif = length_motif[-1]
diff_motif_len = []
for i in range(0, (len(length_motif)-1)):
diff_motif_len.append(abs(shortest_motif-length_motif[i]))
diff_motif_len.sort()
for x in diff_motif_len:
# check if exists in unique_list or not
if x not in uniq_diff_motif_len:
uniq_diff_motif_len.append(x)
def multi2linefasta(file):
'''Function converts multiline sequence fasta files to single line sequence fasta files \
Returns: Nothing'''
mfasta = "singleline.fasta"
infile = open(file,'r')
with open(mfasta, 'w') as ofile:
for record in SeqIO.parse(infile, "fasta"):
sequence = str(record.seq)
if fasta_convert == "RNA":
sequence = sequence.replace("u","t")
sequence = sequence.replace("U", "T")
ofile.write('>'+record.description+'\n'+sequence+'\n')
infile.close()
def f_ParseSequence(sequence):
'''Function takes in a sequence and finds the possible motifs in the sequence \
Returns: Dictionary of motif (key: known motifs, value: number of appearances), list of motifs'''
temp_dict = {}
temp_motif_list = []
with open(motif_file, "r") as mf:
for line in mf:
line = line.strip()
line = line.upper()
temp_dict.setdefault(line, 0)
counter = 0
for i in range(length_motif[-1], len(sequence)):
sub_string = sequence[counter:i]
sub_string = sub_string.upper()
if sub_string in motifDict and len(sub_string) in length_motif:
motif = motifDict.get(sub_string)
numb = motif_numb.get(motif)
temp_motif_list.append(cMotif(counter + 1, i, numb))
motif_count = temp_dict[motif]
temp_dict[motif] = motif_count + 1
for val in uniq_diff_motif_len:
sub_string = sequence[counter:i+val]
sub_string = sub_string.upper()
if sub_string in motifDict and len(sub_string) in length_motif:
motif = motifDict.get(sub_string)
numb = motif_numb.get(motif)
temp_motif_list.append(cMotif(counter + 1, i, numb))
motif_count = temp_dict[motif]
temp_dict[motif] = motif_count + 1
counter = counter + 1
return temp_dict, temp_motif_list
def f_draw(Complete_Gene_List, lonest_gene):
color_dict = {}
color_count = 1
numb_motifs = len(motif_numb)
color_step = 255 / numb_motifs
for key in motif_numb.keys():
color_val = color_count * color_step
tuple = cm.jet(int(color_val))
color_dict.setdefault(key, tuple[0:3])
color_count = color_count + 1
temp_dict = {} # key = color number / value = motif
for key in motif_numb.keys():
temp_dict.setdefault(motif_numb[key], key)
# Calculate Width
longest_motif_font_length = 12 * length_motif[0]
left_border = longest_motif_font_length + 37
num_genes = len(Complete_Gene_List)
height = num_genes * 150
width = left_border + 30 + lonest_gene
# Initialize PyCairo Surface
surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
ct = cairo.Context(surface)
# Draw a white background
ct.set_source_rgb(1, 1, 1)
ct.rectangle(0, 0, width, height) # X1, Y1, Width, Height
ct.fill()
# Draw Legend
motif_count = 0
for key in color_dict.keys():
# Draw Motif Color Boxes
motif_height = motif_count * 15 + 20
tup_color = color_dict[key]
ct.set_source_rgb(tup_color[0], tup_color[1], tup_color[2])
ct.set_line_width(3)
ct.rectangle(10, motif_height, 15, 10)
ct.fill()
# Draw Motif Names
ct.set_source_rgb(0.1, 0.1, 0.1)
ct.select_font_face("Purisa", cairo.FONT_SLANT_NORMAL, cairo.FONT_WEIGHT_NORMAL)
ct.set_font_size(12)
ct.move_to(28, motif_height + 10)
ct.show_text(key)
motif_count = motif_count + 1
# Draw Genes
gene_count = 0
for gene in Complete_Gene_List:
# Draw Gene Line
gene_height = 100 + (gene_count * 150)
ct.set_source_rgb(0.1, 0.1, 0.1)
ct.set_line_width(3)
ct.move_to(left_border, gene_height)
ct.line_to(left_border + gene.mLength, gene_height)
ct.stroke()
# Draw Exon Box
ct.set_source_rgb(0.1, 0.1, 0.1)
ct.set_line_width(3)
exon_width = gene.mExonStop - gene.mExonStart
ct.rectangle(left_border + gene.mExonStop, gene_height - 20, exon_width, 40)
ct.stroke()
# Draw Gene Header
header_height = 45 + (gene_count * 150)
ct.set_source_rgb(0.1, 0.1, 0.1)
ct.select_font_face("Purisa", cairo.FONT_SLANT_NORMAL, cairo.FONT_WEIGHT_NORMAL)
ct.set_font_size(15)
ct.move_to(left_border, header_height)
ct.show_text(gene.mHeaderName)
# Draw Motif Lines
for mt in gene.mMotifList:
motif_string = temp_dict[mt.mColor]
motif_color = color_dict[motif_string]
ct.set_source_rgb(motif_color[0], motif_color[1], motif_color[2])
ct.set_line_width(3)
ct.move_to(left_border + mt.mStart_pos, gene_height - 10)
ct.line_to(left_border + mt.mStart_pos, gene_height + 10)
ct.stroke()
gene_count = gene_count + 1
surface.write_to_png(out_file) # Output to PNG
def main():
longest_gene_length = 0 # Keep track of the longest gene
multi2linefasta(input_file) # Converts a multi line fasta to a single lined fasta
f_SaveMotifs(motif_file) # Read motif file and stores all possible motifs into a dictionary
opened_input = open(input_file, 'r') # open input file
gene_list = [] # store the genes
gene_number = 0 # gene counter
for record in SeqIO.parse(opened_input, "fasta"):
gene_number = gene_number + 1 # Gene number is one based
capital_list = [] # list to store index of capital nucleotides
gene_name = record.id
header = record.description
sequence = record.seq
gene_length = len(sequence)
if gene_length > longest_gene_length: # Save the longest gene length
longest_gene_length = gene_length
for i in range(0, len(sequence)): # For loop keeps track of the index for every capital letter
char = sequence[i]
if char.isupper():
capital_list.append(i + 1)
if len(capital_list) == 0:
exon_start = 0
exon_stop = 0
else:
exon_start = capital_list[0] # Exon start position is one based
exon_stop = capital_list[-1] # Exon end position is one based
dict_count, list_motif = f_ParseSequence(sequence)
gene_list.append(cGene(gene_length, exon_start, exon_stop, header, list_motif, gene_number, dict_count))
f_draw(gene_list, longest_gene_length)
print("Gene header name followed by the count of motifs in each gene")
for obj in gene_list:
print(obj.mHeaderName)
print(obj.mMotifDict)
remove_temp_file = "rm singleline.fasta" # Command to move temp file
os.system(remove_temp_file) # Remove Temp File
opened_input.close() # Close File
main()