run.py
import commands
import os
import subprocess
import sys
from conclude_prediction import *
from run2meka import *
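# Command-line arguments (see the usage line printed below):
#   sys.argv[1] = query protein FASTA file
#   sys.argv[2] = BLAST e-value cutoff
#   sys.argv[3] = number of CPU threads for blastp
#   sys.argv[4] = RAM in GB (converted to MB for the Weka JVM heap, -Xmx)
#   sys.argv[5] = maximum number of BLAST hits (passed to -max_target_seqs)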
path_to_blast='' #already installed
blast_eval=sys.argv[2]
blast_core=sys.argv[3]
ram_usage=str(int(sys.argv[4])*1000)
blast_hit=sys.argv[5]
print('PhotoMod2 v 1.0')
print('Photosynthetic protein classifier')
print('=====================================')
print('- Usage: python run.py [test_protein.fasta] [blast evalue] [cpu] [ram(GB)] [blast hit]')
print('- Output: (1) prediction_report.tab: photosynthetic protein classification results, (2) GO_prediction.tab: photosynthesis-related function (GO) prediction results')
print('=====================================')
print('============= Update ================')
print('- Added the PhotoMod2 multi-label model described in the second manuscript')
print('- Updated the PhotoMod1 binary model described in the first manuscript')
print('=====================================')
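# Load the 24 selected photosynthesis GO terms from a two-column CSV into a {GO ID: description} dict.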
def dict_24_GO_terms(csv_GO):
dict_24GO={}
for i in open(csv_GO):
i=i.rstrip()
j=i.split(',')
dict_24GO[j[0]]=j[1]
return dict_24GO
def check_header_feature(file1,file2): #check identity of the ARFF headers by sampling feature rows at fixed positions
row_f1=commands.getoutput('tail -n+3 '+file1+' ').split('\n')
row_f2=commands.getoutput('tail -n+3 '+file2+' ').split('\n')
if (row_f1[0] == row_f2[0]) and (row_f1[100] == row_f2[100]) and (row_f1[1450] == row_f2[1450]) and (row_f1[7000] == row_f2[7000]) and (row_f1[10000] == row_f2[10000]) and (row_f1[18000] == row_f2[18000]) :
return True
else:
return False
def select_only_new_query(input_file,index_prot_file): #split BLAST hits into proteins already collected in the feature DB (matched) and new proteins (unmatched)
file_index=open(index_prot_file,'r')
read_index=file_index.read()
file_index.close()
read_index=read_index.split('\n')
file_match=open('matched_prot_file.query','w')
file_unmatch=open('unmatched_prot_file.query','w')
for i in open(input_file):
i=i.replace('\n','')
j=i.split()
try:
prot_match_index=read_index.index(j[1])
file_match.write(i+'\t'+str(prot_match_index)+'\n')
except ValueError:
file_unmatch.write(i+'\n')
file_match.close()
file_unmatch.close()
def select_identified_query(index_prot_file,feature_file,header_file): #fetch the pre-computed feature rows of proteins already collected in the feature DB
file_index=open(index_prot_file,'r') #index_prot_file is [query \t index] file
read_index=file_index.read()
file_index.close()
read_index=read_index.split('\n')
#make list of file that contain feature
feature_list=open(feature_file,'r')
read_feature=feature_list.read()
feature_list.close()
read_feature=read_feature.split('\n')
file_match=open('matched_prot_query.arff','w')
for i in read_index:
if i != '':
i=i.split() #BLAST tabular fields with the feature-row index appended: i[1] => matched DB protein, i[8] => index
file_match.write(read_feature[int(i[8])]+'\n')
file_match.close()
commands.getoutput('cat '+header_file+' matched_prot_query.arff > matched_query.arff')
def blast_query_to_MyDB(list_query_fasta,DB_name,evalue_cutoff): #DB_name: our photosynthetic genes in BLAST DB format; blast_hit is passed to -max_target_seqs
#blastp using query gene file
blastp=path_to_blast+"blastp -db "+DB_name+" -query "+list_query_fasta+" -out eval_matched2DB.query -max_hsps 1 -max_target_seqs "+blast_hit+" -outfmt '6 qseqid sseqid evalue pident qstart qend sstart send' -num_threads "+str(blast_core)
print "#blastp using query gene file"
print blastp
os.system(blastp)
print commands.getoutput("awk '$3<="+str(evalue_cutoff)+"' eval_matched2DB.query > matched2DB.query") #filter by evalue
print commands.getoutput("rm eval_matched2DB.query")
print "blastp finished!!!"
inputfile=open(sys.argv[1],'r')
read_input=inputfile.read()
read_input=read_input.split('>')
inputfile.close()
dict_24GO = dict_24_GO_terms('./update_MEKAmodel/24selected_GO_Photosynthesis.csv')
collected_MEGO_out='NO.\tName\tGO\tDescription\tProbability\tMethod\n'
binaryout=open('prediction_report.tab','w')
binaryout.write('Query totalBlastHits PhotoHits(%) nonphotoHits(%) noPredictHits prediction Ave_Prob_prediction\n')
binaryout.close()
#############################################################################################
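# Main loop: for each query sequence, BLAST against the genome DB, look up or build neighborhood
# features, then run PhotoMod1 (Weka binary model) and PhotoMod2 (MEKA multi-label model).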
count=1
for seq in read_input:
if seq != '':
wfasta=open('1_query_file.fasta','w')
wfasta.write('>'+seq)
wfasta.close()
print '>'+seq
prot_name=seq.rstrip().split('\n')[0].split()[0]
#1. BLAST the query protein against our 154-genome proteome DB
path2db = './OnlyGenome/all_154genomes.fasta'
blast_query_to_MyDB("1_query_file.fasta",path2db,blast_eval) #change to evalue cutoff criteria
#proceed only when the query has at least one BLAST hit to the DB
if int(commands.getoutput("grep '' -c matched2DB.query")) > 0:
path_DB='./collected_NB_feature/'
select_only_new_query('matched2DB.query',path_DB+'index_prot.nb') #separate already identified prot and new prot
#filter the new protein
#######################################
#get neighborhood (NB) feature data from the DB when a matched protein is found, otherwise build a new profile and use it as the final query file
print 'matched_prot_file.query => ',int(commands.getoutput('grep "" -c matched_prot_file.query' ))
print 'unmatched => ', int(commands.getoutput("grep '' -c unmatched_prot_file.query"))
if int(commands.getoutput("grep '' -c unmatched_prot_file.query")) > 0 :#check wether there are matched prot found
os.system("python export_GO.py 24240 cds_MCL_cluster_evalue10")
os.system("python make_metrix_conserve_makeprediction.py 10 './call_neighbor_3evalue/e10.C.uniq_nb_id.id' no_label 0.41079 2.61799")
os.system("rm cluster_export.txt GO_neighbor_enrichment.db *.out *.out2")
os.system("rm -r all_neighboring conservation_score")
os.system("python export_GO.py 39068 cds_MCL_cluster_evalue50")
os.system("python make_metrix_conserve_makeprediction.py 50 './call_neighbor_3evalue/e50.C.uniq_nb_id.id' no_label 0.41079 2.61799")
os.system("rm cluster_export.txt GO_neighbor_enrichment.db *.out *.out2")
os.system("rm -r all_neighboring conservation_score")
os.system("python export_GO.py 39896 cds_MCL_cluster_evalue100")
os.system("python make_metrix_conserve_makeprediction.py 100 './call_neighbor_3evalue/e100.C.uniq_nb_id.id' photo 0.41079 2.61799")
os.system("rm cluster_export.txt GO_neighbor_enrichment.db *.out *.out2")
os.system("rm -r all_neighboring conservation_score")
os.system("python concat_feature.py metrix_cv_nb_predicted_e10.csv metrix_cv_nb_predicted_e50.csv no_label >metrix_cv_nb_e10_50.csv")
os.system("python check_file_combind.py metrix_cv_nb_predicted_e10.csv metrix_cv_nb_predicted_e50.csv metrix_cv_nb_e10_50.csv")
os.system("python concat_feature.py metrix_cv_nb_e10_50.csv metrix_cv_nb_predicted_e100.csv photo >metrix_cv_nb_3evalue.csv")
os.system("python check_file_combind.py metrix_cv_nb_e10_50.csv metrix_cv_nb_predicted_e100.csv metrix_cv_nb_3evalue.csv")
if int(commands.getoutput('grep "" -c metrix_cv_nb_3evalue.csv' )) >= 2: #check that the file contains at least one neighbor profile row besides the header
os.system("python add_absent_of_selected_feature.py metrix_cv_nb_3evalue.csv comb.csv")
#os.system("cp comb.temp "+str(count)+"_collect2DB.csv") #to copy for collectig into DB
#in the case of already identified genes
os.system("python file_convertor.py comb.temp 2_query")
os.system("rm converted_input.arff")
#check that the header of the new ARFF file matches the header stored in the DB
print commands.getoutput('sed s/" \' "/" "/g 2_query_converted_input.arff |sed s/"\' "/" "/g |sed s/"\'"/""/g > 3_query_converted_input.arff') #remove stray quote characters
data_check = check_header_feature('3_query_converted_input.arff' , path_DB+'head_arff.nb')
print 'Header checking ... ', data_check
#add data to the DB
if data_check==True:
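# Headers match: append the newly computed feature rows and their protein indices to the local
# feature DB (body_feature.nb / index_prot.nb) so later runs can reuse them.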
print commands.getoutput('tail -n+18856 3_query_converted_input.arff > new_arff_data.arff')
print commands.getoutput('mv '+path_DB+'body_feature.nb '+path_DB+'body_feature.nb.previous')
print commands.getoutput('cat '+path_DB+'body_feature.nb.previous new_arff_data.arff > '+path_DB+'body_feature.nb')
#update index proteins
print commands.getoutput('mv '+path_DB+'index_prot.nb '+path_DB+'index_prot.nb.previous')
print commands.getoutput('cut -f1 -d"," metrix_cv_nb_3evalue.csv | tail -n+2 > updated_proteinindex.txt')
print commands.getoutput('cat '+path_DB+'index_prot.nb.previous updated_proteinindex.txt > '+path_DB+'index_prot.nb')
print commands.getoutput('rm updated_proteinindex.txt ')
if int(commands.getoutput('grep "" -c matched_prot_file.query' )) > 0: #check whether matched proteins were found
select_identified_query('matched_prot_file.query',path_DB+'body_feature.nb',path_DB+'head_arff.nb')
if int(commands.getoutput("grep '' -c unmatched_prot_file.query")) > 0: #to check whether the file contain neighbor profile
if os.path.exists('./metrix_cv_nb_3evalue.csv') == True :
if int(commands.getoutput('grep "" -c metrix_cv_nb_3evalue.csv' )) >= 2 :
print commands.getoutput('cat matched_query.arff new_arff_data.arff > query_final.arff')
else:
print commands.getoutput('mv matched_query.arff query_final.arff')
elif int(commands.getoutput("grep '' -c unmatched_prot_file.query")) > 0: #to check whether the file contain neighbor profile
if int(commands.getoutput('grep "" -c metrix_cv_nb_3evalue.csv' )) >= 2 :
print commands.getoutput('cat '+path_DB+'head_arff.nb new_arff_data.arff > query_final.arff')
else:
print 'Neither matched nor unmatched proteins were found!'
#prediction
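# PhotoMod1: apply the saved Weka FilteredClassifier (current_model.model) to query_final.arff and
# write per-instance predictions to prediction.csv; conclude_result() (imported from
# conclude_prediction) then post-processes prediction.csv, presumably filling prediction_report.tab.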
print('Run PhotoMod1...')
cmd="java -cp ./weka-3-9-1/weka.jar -Xmx"+str(ram_usage)+"m -Xss10000m weka.Run weka.classifiers.meta.FilteredClassifier -T query_final.arff -o -l current_model.model -classifications weka.classifiers.evaluation.output.prediction.CSV > prediction.csv"
ps_weka=subprocess.Popen(cmd,shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE,close_fds=True)
out_weka=ps_weka.stdout.read()
conclude_result()
########################### Run MEKA model ###################
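# PhotoMod2: run the MEKA multi-label model via run_MEGO() (imported from run2meka) to predict
# which of the 24 selected photosynthesis GO terms apply to this query.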
print('Run PhotoMod2...')
status_MEGO=run_MEGO('query_'+str(count),'./update_MEKAmodel/allGO_multilabel_uniq_ready_FS_u.arff.csv','./update_MEKAmodel/allGO_multilabel_uniq_ready_FS_u.arff','query_final.arff') #label file as input
if status_MEGO=='Photo-GO found':
print 'Photo-GO found'
for h in open('query_'+str(count)+'_MEKA_go.go'):
h=h.rstrip()
if h != '':
h=h.split('\t')
collected_MEGO_out+='query_'+str(count)+'\t'+prot_name+'\t'+h[0]+'\t'+dict_24GO[h[0]]+'\t'+h[1]+'\tphotomod2\n'
os.system('rm query_'+str(count)+'_MEKA_go.go')
else:
collected_MEGO_out+='query_'+str(count)+'\t'+prot_name+'\tNo_GO_predicted\t-\t0.0\tphotomod2\n'
#os.system("rm prediction.csv")
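# Per-query cleanup: keep the three-evalue feature matrix under a per-gene name and remove
# intermediate files.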
if os.path.exists("metrix_cv_nb_3evalue.csv") == True:
os.system("mv metrix_cv_nb_3evalue.csv metric_gene"+str(count)+".csv")
os.system("rm metrix_cv_nb_e10_50.csv")
os.system("rm comb.temp matched_query.arff new_arff_data.arff")
os.system("rm 2_query_converted_input.arff 3_query_converted_input.arff ")
else:
print 'No BLAST match found!'
os.system("rm matched2DB.query")
os.system("rm matched_prot_file.query")
os.system("rm unmatched_prot_file.query")
count+=1
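# After all queries are processed, write the accumulated PhotoMod2 GO predictions to GO_prediction.tab.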
wfile=open('GO_prediction.tab','w')
wfile.write(collected_MEGO_out)
wfile.close()
#NOTE: the hard-coded row offset in "tail -n+18856 3_query_converted_input.arff > new_arff_data.arff" must be updated if the number of features changes