#!/usr/bin/env python
# :noTabs=true:
"""
amusingly, when I built this as 3 sections, this was also in the pre_processing file
"""
################################################################################
# IMPORT
# common modules
import os
# bigger modules
# custom modules
from vipur_settings import AMINOCHANGE_GROUPS , PREDICTION_OUTPUT_HEADER
from pre_processing import load_task_summary
from psiblast_feature_generation import load_numbering_map , extract_pssm_from_psiblast_pssm
from probe_feature_generation import extract_accp_from_probe
from rosetta_feature_generation import extract_score_terms_from_ddg_monomer , extract_scores_from_scorefile , extract_quartile_score_terms_from_scorefiles
from classification import VIPUR_classifier , provide_additional_interpretation
################################################################################
# MAIN POSTPROCESSING
# ...not much it can do if the features are incomplete...
# though I suppose that is the point, feature generation should take care of this
def run_postprocessing( task_summary_filename , sequence_only = False ):
    # load the tasks
    if isinstance( task_summary_filename , str ) and os.path.isfile( task_summary_filename ):
        task_summary = load_task_summary( task_summary_filename )
    else:
        # the input is already a loaded task summary dict
        task_summary = task_summary_filename

    # residue map
    residue_map = {}
    if 'numbering_map' in task_summary['filenames'].keys():
        #filename = task_summary['out_path'] +'/'+ task_summary['filenames']['numbering_map']
        filename = task_summary['filenames']['numbering_map']
        residue_map = load_numbering_map( filename )
    # extract the features
    # determine the "aminochange" values
    def evaluate_aminochange( nat , var ):
        nat_group = [i for i in AMINOCHANGE_GROUPS if nat in i][0]
        var_group = [i for i in AMINOCHANGE_GROUPS if var in i][0]
        return 2 - int( nat_group == var_group ) - int( nat == var )
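    # evaluate_aminochange scores 0 if the residues are identical, 1 for a
    # substitution within the same group, and 2 for a substitution across groups
    # e.g. (hypothetical residues, assuming 'D' and 'E' share an
    # AMINOCHANGE_GROUPS group while 'D' and 'K' do not):
    #     evaluate_aminochange( 'D' , 'D' )    # 0
    #     evaluate_aminochange( 'D' , 'E' )    # 1
    #     evaluate_aminochange( 'D' , 'K' )    # 2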
    # do this mapping once, rather than locate each task when needed
    important_tasks = {}
    for i in task_summary['commands']:
        feature = i['feature']
        if feature in ['psiblast' , 'probe' , 'ddg_monomer' , 'relax_native_rescore']:
            key = feature
        elif feature == 'relax_rescore':
            key = 'relax_rescore_' + i['variant']
        else:
            # misc/relax runs...
            continue
        if key in important_tasks.keys():
            raise Exception( '??? duplicate ' + key + ' task ???' )
        important_tasks[key] = i
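    # each entry of important_tasks is a command dict; from how the entries are
    # used below, each is expected to carry (at least) 'feature', an
    # 'output_filename', a 'run' status containing 'success', and (for
    # relax_rescore tasks) a 'variant'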
    # psiblast
    if not 'psiblast' in important_tasks.keys() or not 'run' in important_tasks['psiblast'].keys() or not 'success' in important_tasks['psiblast']['run']:
        raise Exception( 'psiblast did not complete successfully!!!' )
    pssm = extract_pssm_from_psiblast_pssm( important_tasks['psiblast']['output_filename'] )

    if not residue_map:
        residue_map = dict( [(str( i ) , str( i )) for i in pssm.keys()] )
    for i in task_summary['variants'].keys():
        if 'failed' in task_summary['variants'][i].keys():
            continue

        mutation = i.split( '_' )[-1]
        pos = mutation[1:-1]
        pos = int( residue_map[pos] ) + 1    # +1: shifted during parsing
        nat = mutation[0]
        var = mutation[-1]

        task_summary['variants'][i]['features'] = {
            'aminochange' : evaluate_aminochange( nat , var ) ,
            'pssm_native' : pssm[pos]['log-likelihood'][nat] ,
            'pssm_variant' : pssm[pos]['log-likelihood'][var] ,
            'pssm_difference' : pssm[pos]['log-likelihood'][nat] - pssm[pos]['log-likelihood'][var] ,
            'pssm_information_content' : pssm[pos]['information content'] ,
            }
    if not sequence_only:
        # probe
        if not 'probe' in important_tasks.keys() or not 'run' in important_tasks['probe'].keys() or not 'success' in important_tasks['probe']['run']:
            raise Exception( 'probe did not complete successfully!!!' )
        if not 'probe_positions' in task_summary['other']:
            raise Exception( 'task summary was not written properly' )
        positions = task_summary['other']['probe_positions']

        # extract the PROBE feature
        accp = extract_accp_from_probe( important_tasks['probe']['output_filename'] )
        # positions is a parallel list of positions
        accp_dict = dict( [(positions[i] , accp[i]) for i in xrange( len( accp ) )] )
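        # e.g. (hypothetical values) positions = ['12' , '13'] and
        # accp = [45.2 , 3.1] would give accp_dict = {'12' : 45.2 , '13' : 3.1}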
        for i in task_summary['variants'].keys():
            if 'failed' in task_summary['variants'][i].keys():
                continue
            pos = i.split( '_' )[-1][1:-1]
            if pos in accp_dict.keys():
                task_summary['variants'][i]['features']['probe_accp'] = accp_dict[pos]
        # ddg monomer
        if not 'ddg_monomer' in important_tasks.keys() or not 'run' in important_tasks['ddg_monomer'].keys() or not 'success' in important_tasks['ddg_monomer']['run']:
            raise Exception( 'ddg_monomer did not complete successfully!!!' )
        ddg_monomer_dict = extract_score_terms_from_ddg_monomer( out_filename = important_tasks['ddg_monomer']['output_filename'] )

        # the residues from this will be "pose numbered", need to map back to the PDB numbering
        header = ddg_monomer_dict['description']
        for i in task_summary['variants'].keys():
            if 'failed' in task_summary['variants'][i].keys():
                continue
            mutation = i.split( '_' )[-1]
            new_key = mutation[0] + str( int( residue_map[mutation[1:-1]] ) + 1 ) + mutation[-1]
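            # e.g. (hypothetical) mutation 'P103R' with residue_map['103'] == '96'
            # becomes the pose-numbered key 'P97R'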
            for j in xrange( len( header ) ):
                task_summary['variants'][i]['features']['ddg_' + header[j]] = float( ddg_monomer_dict[new_key][j] )
                # the "ddg_" prefix is kept for legacy compatibility; the name is arbitrary, could be made more informative
        # relax
        # get native relax reference scores
        if not 'relax_native_rescore' in important_tasks.keys() or not 'run' in important_tasks['relax_native_rescore'].keys() or not 'success' in important_tasks['relax_native_rescore']['run']:
            raise Exception( 'relax for the native structure reference did not complete successfully!!!' )
        native_scorefile_dict = extract_scores_from_scorefile( important_tasks['relax_native_rescore']['output_filename'] )    # save time, only parse this once

        for i in task_summary['variants'].keys():
            if 'failed' in task_summary['variants'][i].keys():
                continue
            # this file SHOULD exist for every variant
            # if it does not, there was a problem earlier
            variant_structure = task_summary['variants'][i]['structure_filename']
            # note: this current version assumes 1:1 for variant structures and
            # variants...could change in the future...
            mutation = i.split( '_' )[-1]

            if not 'relax_rescore_' + mutation in important_tasks.keys() or not 'run' in important_tasks['relax_rescore_' + mutation].keys() or not 'success' in important_tasks['relax_rescore_' + mutation]['run']:
                raise Exception( 'rescore (or relax?) did not complete successfully!!!' )

            # extract features, use the quartile method to extract comparisons
            # between the native and variant score distributions
            quartile_scores = extract_quartile_score_terms_from_scorefiles( important_tasks['relax_rescore_' + mutation]['output_filename'] , native_scorefile_dict )

            # make sure there is no overlap with existing feature names
            overlaps = [j for j in quartile_scores.keys() if j in task_summary['variants'][i]['features'].keys()]
            if overlaps:
                raise IOError( '!!?! somehow your scorefile has score terms that match the name(s) of features that already exist!!?! something has gone horribly wrong...' + str( overlaps ) )
            task_summary['variants'][i]['features'].update( quartile_scores )
    ############
    # run the classifier

    # combine and prune feature vector
    print 'classifying the variants as "deleterious" or "neutral"'

    # calculate scores using the logistic regression parameters
    predictions = {}
    for i in task_summary['variants'].keys():
        if 'failed' in task_summary['variants'][i].keys():
            # add as a failure
            predictions[i] = '\t'.join( [i.split( '/' )[-1] , i , 'failed: ' + task_summary['variants'][i]['failed'] , str( 0 )] )    # added feb 2016 to match below
            continue
        print 'Variant: ' + i.split( '/' )[-1]
        if sequence_only:
            # just the sequence only classifier
            # standardize output, add in empty values
            interpretation = VIPUR_classifier.sequence_classifier.summarize_classification( task_summary['variants'][i]['features'] )
            interpretation['structure label'] = 'NA'
            interpretation['structure P'] = 'NA'
            interpretation['sequence label'] = interpretation['label']
            interpretation['sequence P'] = interpretation['P']
            interpretation['exposure'] = 'NA'
            essential_score = 'NA'
            task_summary['variants'][i]['features']['ddg_total'] = 'NA'
            interpretation['interpretation'] = 'NA'
            interpretation['explanation'] = ['NA']
        else:
            # full prediction
            interpretation = VIPUR_classifier.interpret_classification( task_summary['variants'][i]['features'] )
            provide_additional_interpretation( interpretation )    # adds directly into the dict
            essential_score = interpretation['P'] - interpretation['structure P']
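            # as computed here, the "essential score" is simply the difference
            # between the combined classifier P and the structure-only P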
        task_summary['variants'][i]['interpretation'] = interpretation

        # output append
        predictions[i] = '\t'.join( [str( j ) for j in [
            i.split( '/' )[-1] ,    # added jan 2016
            i ,
            interpretation['label'] ,
            interpretation['P'] ,
            #interpretation['P*'] ,    # not now
            #interpretation['score'] ,    # not now
            interpretation['structure label'] ,
            interpretation['structure P'] ,
            interpretation['sequence label'] ,
            interpretation['sequence P'] ,
            interpretation['exposure'] ,
            #interpretation['conservation'] ,
            essential_score ,
            task_summary['variants'][i]['features']['ddg_total'] ,
            interpretation['interpretation'] ,
            ', '.join( interpretation['explanation'] )
            #+ interpretation['terms']    # not now...
            ] ] )
    # write output predictions etc.
    prediction_filename = task_summary['filenames']['prediction_filename']
    if not prediction_filename:
        prediction_filename = task_summary['filenames']['pdb_filename']
        # strip '.fasta' before '.fa', since '.fa' is a substring of '.fasta'
        prediction_filename = prediction_filename.replace( '.pdb' , '' ).replace( '.fasta' , '' ).replace( '.fa' , '' ) + '.predictions'
        task_summary['filenames']['prediction_filename'] = prediction_filename

    f = open( prediction_filename , 'w' )
    # very shameful, hardcoded indices again...
    # sort by the predicted P (column 3), highest first; failed variants carry 0 and sort last
    temp = predictions.values()
    temp = sorted( temp , key = lambda x : float( x.split( '\t' )[3] ) , reverse = True )
    f.write( '\t'.join( PREDICTION_OUTPUT_HEADER ) +'\n'+ '\n'.join( temp ) )
    f.close()
    # write out task summary? any new details added?
    # meh, should be derivative analysis using post_processing anyway...
    return task_summary
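
################################################################################
# minimal driver sketch: the command-line handling below is an assumption for
# illustration (not part of the original VIPUR interface); it expects the path
# to a task summary file written by the pre_processing step
if __name__ == '__main__':
    import sys
    # hypothetical flag to skip the structure-based features
    sequence_only = '--sequence-only' in sys.argv
    run_postprocessing( sys.argv[1] , sequence_only = sequence_only )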