-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcorrect_mfe.py
343 lines (262 loc) · 10.7 KB
/
correct_mfe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
# This script is designed to mutate the sequence whose
# RNAfold-mfe structure has the correct topology but
# partially wrong correlation in vertices.
#
# This part will be interfaced to the 'master(-gaif)' script
# in the S3 fail branch.
#
# issue:
import os
import os.path
import sys
from subprocess import check_output
# get filenames
for fname in os.listdir('.'):
if fname.endswith('.seq'):
#print fname
prefix = fname.split(".")[0]
break
inpf1 = prefix+".tg_log"
inpf2 = prefix+".mfe.tg_log"
inpf3 = prefix+".bpseq"
#prefix = inpf3.split(".")[0]
#analyze_path = "/Users/yt34/NYU_Drive_Google/Work/RNA-projects/myOwnScripts/analyzer/S1-script/"
#s2_path = "/Users/yt34/NYU_Drive_Google/Work/RNA-projects/myOwnScripts/analyzer/S2-script/"
#me_path = "/Users/yt34/NYU_Drive_Google/Work/RNA-projects/myOwnScripts/mutationEngine/"
#sub_path = "/Users/yt34/NYU_Drive_Google/Work/RNA-projects/Cases/subroutines/"
#RNAinverseDir="/Users/yt34/NYU_Drive_Google/Work/RNA-projects/GAIF/"
analyze_path = "/Users/sj78/Documents/labwork/MutationsForDesign/RAG-IF_Code/"
s2_path = "/Users/sj78/Documents/labwork/MutationsForDesign/RAG-IF_Code/"
me_path = "/Users/sj78/Documents/labwork/MutationsForDesign/RAG-IF_Code/"
sub_path = "/Users/sj78/Documents/labwork/MutationsForDesign/RAG-IF_Code/"
RNAinverseDir="/Users/sj78/Documents/labwork/MutationsForDesign/RAG-IF_Code/"
def wc(filename):
return int(check_output(["wc", "-l", filename]).split()[0])
def getSS(ex_seq): # based on NUPACK
f1 = open("tmpNupack.in","w")
f1.write(ex_seq)
f1.close()
os.system("/opt/nupack3.2.2/bin/mfe -material rna tmpNupack")
os.system("rm tmpNupack.in")
target = ""
with open("tmpNupack.mfe") as pointer:
for line in pointer:
if "((" in line:
target = line
break
if ".." in line:
target = line
break
if "))" in line:
target = line
break
os.system("rm -rf tmpNupack.mfe")
return target
def getSS_RNAfold( ex_seq ):
f1 = open("tmpRNAfold.in",'w')
f1.write(ex_seq)
f1.close()
os.system("/opt/viennaRNA2.3.5/bin/RNAfold -p -d2 --noLP < tmpRNAfold.in > tmpRNAfold.out" )
os.system("rm -rf tmpRNAfold.in rna.ps dot.ps")
# we don't need this fasta file
#f1 = open("tmpRNAfold-mfe.fa","w")
#f1.write(">test\n")
#f1.write(ex_seq)
#f1.close()
# we only take the mfe structure
counter = 0
with open("tmpRNAfold.out") as pointer:
for line in pointer:
counter = counter + 1
if counter == 2:
tmpRNA_mfe_ss = line.split()[0]
os.system("rm -rf tmpRNAfold.out")
return tmpRNA_mfe_ss
def getTopo_RNAfold( ex_seq, flag ):
#fd2seqDir = "/Users/yt34/NYU_Drive_Google/Work/RNA-projects/myOwnScripts/dotfa2bp/"
fd2seqDir = "/Users/sj78/Documents/labwork/MutationsForDesign/RAG-IF_Code/"
TGpath="/Users/sj78/Documents/labwork/MutationsForDesign/RAG-IF_Code/modified-treeGraph/"
if flag == 1:
ss_RNAfold = getSS_RNAfold( ex_seq )
if flag == 2:
ss_RNAfold = getSS( ex_seq ) # NUPACK
# prepare fasta and dotbracket files
f1 = open("tmpRNAfold-fasta-1.fa",'w')
f1.write(">test1\n")
f1.write(ex_seq)
f1.close()
f1 = open("tmpRNAfold-fasta-1.dotbracket",'w')
f1.write(ss_RNAfold)
f1.close()
# get bpseq file
os.system('rm -rf tmpRNAfold-fasta-1.bpseq')
os.system("python "+fd2seqDir+"dotfa2bpseq.py tmpRNAfold-fasta-1.fa tmpRNAfold-fasta-1.dotbracket"+">/dev/null ")
os.system("rm -rf tmpRNAfold-fasta-1.fa tmpRNAfold-fasta-1.dotbracket")
# run treeGraph
rnafold_mfe_top = ""
os.system("tail -n +2 tmpRNAfold-fasta-1.bpseq > tmp1 ")
os.system("mv tmp1 tmpRNAfold-fasta-1.bpseq")
os.system("python "+TGpath+"treeGraphs.py tmpRNAfold-fasta-1.bpseq > tmpRNAfold-fasta-1.tg_log")
with open("tmpRNAfold-fasta-1.tg_log") as pointer:
for line in pointer:
if "Graph ID" in line:
rnafold_mfe_top = line.split()[-1].strip()
os.system("rm -rf tmpRNAfold-fasta-1.bpseq")# tmpRNAfold-fasta-1.tg_log ")
#print "current topology is:", rnafold_mfe_top
return rnafold_mfe_top
#
def getTopo_forFiles( ex_seq, flag, prefix ):
#fd2seqDir = "/Users/yt34/NYU_Drive_Google/Work/RNA-projects/myOwnScripts/dotfa2bp/"
fd2seqDir = "/Users/sj78/Documents/labwork/MutationsForDesign/RAG-IF_Code/"
TGpath="/Users/sj78/Documents/labwork/MutationsForDesign/RAG-IF_Code/modified-treeGraph/"
if flag == 1:
ss_RNAfold = getSS_RNAfold( ex_seq )
if flag == 2:
ss_RNAfold = getSS( ex_seq ) # NUPACK
# prepare fasta and dotbracket files
f1 = open("tmpRNAfold-fasta-1.fa",'w')
f1.write(">test1\n")
f1.write(ex_seq)
f1.close()
f1 = open("tmpRNAfold-fasta-1.dotbracket",'w')
f1.write(ss_RNAfold)
f1.close()
# get bpseq file
os.system('rm -rf tmpRNAfold-fasta-1.bpseq')
os.system("python "+fd2seqDir+"dotfa2bpseq.py tmpRNAfold-fasta-1.fa tmpRNAfold-fasta-1.dotbracket"+">/dev/null ")
os.system("rm -rf tmpRNAfold-fasta-1.fa tmpRNAfold-fasta-1.dotbracket")
# run treeGraph
rnafold_mfe_top = ""
os.system("cp tmpRNAfold-fasta-1.bpseq tmpRNAfold-fasta-1.bpseq.bak")
os.system("tail -n +2 tmpRNAfold-fasta-1.bpseq > tmp1 ")
os.system("mv tmp1 tmpRNAfold-fasta-1.bpseq")
os.system("python "+TGpath+"treeGraphs.py tmpRNAfold-fasta-1.bpseq > tmpRNAfold-fasta-1.tg_log")
#with open("tmpRNAfold-fasta-1.tg_log") as pointer:
# for line in pointer:
# if "Graph ID" in line:
# rnafold_mfe_top = line.split()[-1].strip()
if flag == 1:
os.system("rm -rf tmpRNAfold-fasta-1.bpseq")
os.system("mv tmpRNAfold-fasta-1.bpseq.bak "+prefix+".rnafold.bpseq")
os.system("mv tmpRNAfold-fasta-1.tg_log "+prefix+".rnafold.tg_log")
if flag == 2:
os.system("rm -rf tmpRNAfold-fasta-1.bpseq") # tmpRNAfold-fasta-1.tg_log ")
os.system("mv tmpRNAfold-fasta-1.tg_log "+prefix+".nupack.tg_log")
return
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# main
# get the designed seq.
f1 = open(prefix+".seq",'r')
seq_org = f1.readline().strip()
#print seq_org
#sys.exit()
# correlate design_tg and mfe_tg
os.system("python "+analyze_path+"analyzer.py "+ inpf2 + " " + inpf1 + "> design_mfe.s1_summary" )
os.system("python "+s2_path+"s2_script.py "+"design_mfe.s1_summary "+inpf2+" "+inpf1+"> dm-ifmutate.log")
# 'yes' - to be mutated
# 'no' - not to be mutated
# run Mutation Engine
os.system("python "+me_path+"mutEngine.py "+inpf3+" dm-ifmutate.log")
#sys.exit()
ifsufficient = 0
nstepheaven = -2
while ifsufficient == 0:
# prepare the '.gaif_conf' file
nstepheaven = nstepheaven + 2
os.system("rm -rf .gaif_conf")
f1 = open(".gaif_conf","w")
f1.write("nstepheaven = "+str(nstepheaven)+"\n")
f1.close()
# Run GAIF
os.system("python "+RNAinverseDir+"gaif.py "+prefix+".rnaInverseInp "+prefix+".seq " )
#sys.exit()
#
# python /Users/yt34/NYU_Drive_Google/Work/RNA-projects/GAIF/gaif.py Top104.rnaInverseInp Top104.seq
#
# check whether the sequences in 'heaven.txt' are sufficient or not
if os.path.isfile("heaven.txt"):
nseq = wc("heaven.txt")
print nseq,"sequences are in heaven.txt"
if nseq > 100:
ifsufficient = 1
else:
continue
else: # if heaven.txt does not even exist!
continue
# After this, we may have a lot of sequences in the 'heaven.txt' file, we need to find just a few
# successful sequences that have all vertices correlated with the designed one
# we need to check the following:
# 1) topology type (RNAfold)
# 2) whether all vertices are correlated (RNAfold and design)
# 3)* whether this has been already a successful one which can pass the verification of NuPACK [optional]
#
# Current optimal way is to find the seuquence with minimal mutation to replace the originally designed
# sequence.
seqs = []
with open("heaven.txt") as f: # read in sequences generated by GAIF
for line in f:
if len(line) > 2:
seq = line.split()[0].strip()
seqs.append(seq)
target_topo = ""
with open(inpf1) as f: # get the target topology
for line in f:
if "Graph ID" in line:
target_topo = line.split()[-1].strip()
#print target_topo
#sys.exit()
min_mut = 99
min_mut_seq = ''
for i in range(len(seqs)):
ex_seq = seqs[i]
current_topo = getTopo_RNAfold( ex_seq, 1 ) # "tmpRNAfold-fasta-1.tg_log" is left
#print "debug",current_topo , target_topo
if current_topo == target_topo:
# correlate vertices
os.system("python "+analyze_path+"analyzer.py "+"tmpRNAfold-fasta-1.tg_log "+inpf1+ ">design_mfe.s2_summary")
with open("design_mfe.s2_summary") as f:
if "All vertices have been analyzed" in f.read():
# find the sequence with minimal mutations
count = 0
for x,y in zip(seq_org, ex_seq):
if x != y:
count = count + 1
#print count
if count < min_mut:
min_mut = count
min_mut_seq = ex_seq
#print min_mut
# check NuPACK topo. -- this does not work for 8_12, but might work for others
#nupack_topo = getTopo_RNAfold( ex_seq, 2 )
#print nupack_topo
#
os.system("rm -rf tmpRNAfold-fasta-1.tg_log design_mfe.s2_summary ")
#sys.exit()
#
# - - - END OF FOR LOOP - - - - - -
if min_mut == 99:
print "No good sequences.\nStop!"
f4 = open("STOP_SIGNAL","w")
f4.write("stop at S3 fail branch")
f4.close()
sys.exit()
#
#
# dump the current sequence as a file
print "find minimal mutations:", min_mut
f2 = open("mut-"+prefix+".seq","w")
f2.write(min_mut_seq) # +" "+str(min_mut))
f2.close()
# in the end, "mut-[prefix].seq" will be generated
# rename and clean-up
os.system("mv "+prefix+".rnaInverseInp "+ prefix+".rnaInverseInp.s3_fail ")
#backup necessary files
os.system("mv "+prefix+".nupack.tg_log "+prefix+".nupack.tg_log.bak ")
os.system("mv "+prefix+".rnafold.tg_log "+prefix+".rnafold.tg_log.bak ")
os.system("mv "+prefix+".rnafold.bpseq "+prefix+".rnafold.bpseq.bak ")
# obtain the above three files
# 1. [prefix].rnafold.seq <-
#
getTopo_forFiles( min_mut_seq, 1, prefix ) # rnafold
getTopo_forFiles( min_mut_seq, 2, prefix ) # nupack
#