-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathentry_point.py
408 lines (342 loc) · 15.2 KB
/
entry_point.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
from distutils import dir_util  # NOTE(review): distutils is removed in Python 3.12; dir_util appears unused in this file -- confirm before deleting
import os
import shutil
import sys
import logging as log
import numpy as np

# Make the repository root importable when the script is run from a checkout.
MAIN_PACKAGE_DIR = os.path.abspath(os.curdir)
sys.path.append(MAIN_PACKAGE_DIR)

# Optional project imports: each one is attempted separately so that a missing
# module only disables the related CLI mode instead of crashing the script.
# Only ImportError is swallowed -- any other error raised while importing a
# module that *does* exist should propagate instead of being hidden.
try:
    from graph_cnn.data_prep import data_generator
except ImportError:
    log.debug('failed to import data_generator from graph_cnn.data_prep')
try:
    from cli_arguments import ModelingParser
except ImportError:
    log.debug('failed to import ModelingParser from cli_arguments')
try:
    from graph_cnn.model import GraphCNN
except ImportError:
    log.debug('failed to import GraphCNN from graph_cnn.model')
try:
    from graph_cnn.run_model import runModel, runGNN
except ImportError:
    log.debug('failed to import runModel and runGNN from graph_cnn.run_model')
try:
    from data_files.TMdomains.UniprotScrape import scrape_TMs
except ImportError:
    log.debug('failed to import scrape_TMs from data_files.TMdomains.UniprotScrape')
try:
    from RF.CombineLigandsProteins import develop_matrices, features_matrix
except ImportError:
    log.debug('failed to import develop_matrices and features_matrix from RF.CombineLigandsProteins')
try:
    from RF.FixedClassificationModel import train
except ImportError:
    log.debug('failed to import train from RF.FixedClassificationModel')
try:
    from graph_cnn.hp_model import optimizeHyperparameters
except ImportError:
    log.debug('failed to import optimizeHyperparameters from graph_cnn.hp_model')
try:
    import config
except ImportError:
    log.debug('failed to import config')
def createTemporaryDirectories():
    """Create the temporary staging folders for user-supplied GNN inputs.

    Raises FileExistsError (via os.mkdir) if any folder is already present.
    """
    for folder_name in (
        'temp_protein_bgf',
        'temp_ligand_adj_npy',
        'temp_ligand_feat_npy',
        'temp_protein_adj_npy',
        'temp_protein_feat_npy',
    ):
        os.mkdir(os.path.join(MAIN_PACKAGE_DIR, folder_name))
def createRFDirectories():
    """Create the temporary staging folders used by the RF pipeline.

    Raises FileExistsError (via os.mkdir) if any folder is already present.
    """
    for folder_name in ('temp_aa', 'temp_3Di', 'temp_smiles', 'temp_TMs'):
        os.mkdir(os.path.join(MAIN_PACKAGE_DIR, folder_name))
#Remove temporary folders
def removeTemporaryDirectories():
    """Delete the temporary GNN staging folders.

    Folders that do not exist are silently skipped (ignore_errors=True),
    so this is safe to call from a ``finally:`` block even when
    createTemporaryDirectories() failed partway through.
    """
    for folder_name in (
        'temp_protein_bgf',
        'temp_ligand_adj_npy',
        'temp_ligand_feat_npy',
        'temp_protein_adj_npy',
        'temp_protein_feat_npy',
    ):
        shutil.rmtree(os.path.join(MAIN_PACKAGE_DIR, folder_name), ignore_errors=True)
def removeRFDirectories():
    """Delete the temporary RF staging folders.

    Missing folders are ignored (ignore_errors=True) so cleanup from a
    ``finally:`` block cannot itself raise when directory creation failed.
    """
    for folder_name in ('temp_aa', 'temp_3Di', 'temp_smiles', 'temp_TMs'):
        shutil.rmtree(os.path.join(MAIN_PACKAGE_DIR, folder_name), ignore_errors=True)
def generateNpyMatrices(protein_path='input_protein_pdb', ligand_path='input_ligand_mol'):
    """Convert the input PDB and MOL files into NPY adjacency/feature matrices.

    Results are written into the temp_* staging folders created by
    createTemporaryDirectories().
    """
    # Proteins: PDB -> BGF -> adjacency + feature matrices.
    data_generator.generateProteinMatrices(
        pdb_path=protein_path,
        bgf_path='temp_protein_bgf',
        target_adj_path='temp_protein_adj_npy',
        target_feat_path='temp_protein_feat_npy',
    )
    # Ligands: MOL -> adjacency + feature matrices.
    data_generator.generateLigandMatrices(
        mol_path=ligand_path,
        target_adj_path='temp_ligand_adj_npy',
        target_feat_path='temp_ligand_feat_npy',
    )
#Create a list of every protein-ligand pair in the folders
def generateLabelsList(protein_folder='input_protein_pdb', ligand_folder='input_ligand_mol'):
    """Return [[protein_name, ligand_name], ...] for every cross pair.

    Names are the file basenames with their 4-character extensions
    ('.pdb' / '.mol') stripped; non-matching files are skipped.
    """
    protein_names = [f[:-4] for f in os.listdir(protein_folder) if f.endswith('.pdb')]
    ligand_names = [f[:-4] for f in os.listdir(ligand_folder) if f.endswith('.mol')]
    return [[protein, ligand] for protein in protein_names for ligand in ligand_names]
def savePredictions(label_list, results):
    """Write one 'protein,ligand,score' CSV line per pair to predicted_results.txt.

    label_list -- sequence of (protein_name, ligand_name) pairs.
    results    -- predicted score for each pair, aligned with label_list.
    """
    # Bug fix: output filename was misspelled 'predeicted_results.txt'.
    with open('predicted_results.txt', 'w') as results_file:
        for (protein, ligand), score in zip(label_list, results):
            results_file.write(f'{protein},{ligand},{score}\n')
def make_accession_list(proteins, protein_structure_folder):
    """Write UniProt accession IDs, one per line, to the file *proteins*.

    Accessions are derived from AlphaFold-style pdb filenames
    ('AF-<accession>-F1-model_v2.pdb') found in *protein_structure_folder*;
    files without a '.pdb' suffix are skipped.
    """
    with open(proteins, 'w') as out_file:
        for filename in os.listdir(protein_structure_folder):
            if not filename.endswith('.pdb'):
                continue
            accession = filename.replace('AF-', '').replace('-F1-model_v2.pdb', '')
            out_file.write(accession + '\n')
def ppp():
    """Parse command-line arguments and dispatch to the requested workflow.

    Supported model families: the graph CNN ('gnn', with hptuning /
    eval_tuple / eval_protein / eval_ligand sub-modes), a placeholder
    'cnn' mode, and the random-forest pipeline ('rf', with an
    'eval_pairs' sub-mode plus a default train-and-evaluate mode).
    """
    parser = ModelingParser()
    parser.setup_arguments()
    args = parser.parse_args()
    # Resolve each hyperparameter, substituting a default when the flag was
    # not supplied.  NOTE(review): these truthiness checks also treat
    # explicitly-passed falsy values (0, 0.0, False) as "not supplied".
    if args.batch_size:
        batch_size = args.batch_size
    else:
        # -1 is forwarded to runModel when no batch size is given --
        # presumably a "use the model default" sentinel; TODO confirm.
        batch_size = -1
    if args.fitting_batch_size:
        fitting_batch_size = args.fitting_batch_size
    else:
        fitting_batch_size = 64
    if args.optimizer:
        optimizer = args.optimizer
    else:
        optimizer = 'adam'
    if args.learning_rate:
        learning_rate = args.learning_rate
    else:
        learning_rate = 0.001
    if args.dropout:
        dropout = args.dropout
    else:
        dropout = 0.2
    if args.test_train_split:
        test_train_split = args.test_train_split
    else:
        test_train_split = 0.15
    if args.validation_split:
        validation_split = args.validation_split
    else:
        validation_split = 0.15
    if args.callbacks:
        callbacks = args.callbacks
    else:
        callbacks = True
    # Bundle the resolved settings for the hyperparameter-tuning path; keys
    # (apart from 'callbacks') are hyperparameter handles from the config module.
    hparams = {
        config.HP_OPTIMIZER: optimizer,
        config.HP_LEARNINGRATE: learning_rate,
        config.HP_BATCH_SIZE: fitting_batch_size,
        config.HP_DROPOUT: dropout,
        config.HP_TEST_TRAIN_SPLIT: test_train_split,
        config.HP_VALIDATION_SPLIT: validation_split,
        'callbacks': callbacks
    }
    if (args.gnn_mode) or (args.model == 'gnn'):
        # True when the GNN should run as a classifier rather than a regressor.
        classification = args.gnn_cl == True
        if args.gnn_mode == 'hptuning':
            optimizeHyperparameters(classification, hparams)
        elif args.gnn_mode == 'eval_tuple':
            # Score every protein-ligand pair found in the default input folders.
            X = generateLabelsList()
            createTemporaryDirectories()
            log.info('Generated BGF and MOL files in temp directories.')
            try:
                generateNpyMatrices()
                log.info('Generated NPY arrays')
                temp_folders=[
                    'temp_protein_adj_npy',
                    'temp_protein_feat_npy',
                    'temp_ligand_adj_npy',
                    'temp_ligand_feat_npy'
                ]
                g = GraphCNN()
                g.initialize()
                # Dummy '0' labels: only the input tensors matter for prediction.
                temp_tensors, dummy_y = g.getTensors(X, ['0']*len(X), temp_folders)
                model = runModel(batch_size=batch_size, classification=classification)
                predicted_value = runGNN(model, temp_tensors)
                log.info('The predicted binding affinity is ' + str(predicted_value))
                print('The predicted value is ', predicted_value)
            finally:
                # Always remove the staging folders, even if prediction failed.
                removeTemporaryDirectories()
        elif args.gnn_mode == 'eval_protein':
            # Score the user-supplied proteins against the bundled ligand library.
            X = generateLabelsList(ligand_folder=config.MOL_FILES_PATH)
            createTemporaryDirectories()
            try:
                generateNpyMatrices(ligand_path=config.MOL_FILES_PATH)
                log.info('Generated NPY arrays')
                temp_folders=[
                    'temp_protein_adj_npy',
                    'temp_protein_feat_npy',
                    'temp_ligand_adj_npy',
                    'temp_ligand_feat_npy'
                ]
                g = GraphCNN()
                g.initialize()
                temp_tensors, dummy_y = g.getTensors(X, ['0']*len(X), temp_folders)
                model = runModel(batch_size=batch_size, classification=classification)
                predicted_values = runGNN(model, temp_tensors)
                log.info('The predicted binding affinity is ' + str(predicted_values))
                print('The predicted value is ', predicted_values)
                savePredictions(X, predicted_values)
            finally:
                removeTemporaryDirectories()
        elif args.gnn_mode == 'eval_ligand':
            # Ligand-only evaluation is not implemented yet.
            pass
        else:
            # No recognized sub-mode: just run/train the GNN model.
            model = runModel(batch_size, classification=classification)
    elif args.model == 'cnn':
        print('CNN CLI is not implemented yet!')
    elif (args.rf_mode) or (args.model == 'rf'):
        print('RF CLI is not implemented yet!')
        # File-system layout expected by the random-forest pipeline.
        protein_structure_folder='input_protein_pdb'
        Di_fasta = 'foldseek/outputDb_ss.fasta'
        protein_sequence_folder='input_protein_fasta'
        ligand_folder='input_ligand_smiles'
        ligand_csv = 'input_ligand_smiles/smiles.csv'
        proteins = 'temp_TMs/accessions.txt'
        TMs = 'temp_TMs/TM.txt'
        TM_csv = 'temp_TMs/TM.csv'
        experimental_results = 'input_results'
        accession_to_ensemble = 'ensemble_to_accession.csv'
        # NOTE(review): every step below is wrapped in a bare `except:` that
        # prints a hint and continues, so a failed step can leave names such
        # as `result` or `clf` undefined and make a later step fail with
        # NameError -- which is then caught and reported generically.
        if args.rf_mode == 'eval_pairs':
            print('eval_pairs not implemented')
            try:
                createRFDirectories()
            except:
                print('Failed to make temporary directories')
            try:
                make_accession_list(proteins, protein_structure_folder)
                log.info("Made list of accessions")
            except:
                print('Failed to create list of protein accessions')
                if not os.path.exists(protein_structure_folder):
                    print('Please input pdb files into a folder in your working directory called input_protein_pdb')
            try:
                scrape_TMs(proteins, TMs, TM_csv)
                log.info('Scraped TMs')
            except:
                print('Unable to scrape TMs')
                # Diagnose which intermediate file is missing.
                if not os.path.exists(proteins):
                    print('Cannot find list of accession names')
                elif not os.path.exists(TMs):
                    print('Failed to create txt file of TM domains')
                elif not os.path.exists(TM_csv):
                    print('Failed to create csv file of TM domains')
            try:
                # NOTE(review): this rebinds `proteins` from a file path to the
                # protein collection returned by features_matrix -- intentional
                # for the prints below, but confusing; consider renaming.
                features, proteins, ligands = features_matrix(ligand_csv, TM_csv, Di_fasta, accession_to_ensemble)
                log.info('Created feature matrix')
            except:
                print('Unable to create feature matrix')
                if not os.path.exists(ligand_csv):
                    print('Please upload a csv of ligand smiles into the file path: "input_ligand_smiles/smiles.csv" with format Ligands,SMILE')
                elif not os.path.exists(Di_fasta):
                    print("Please download foldseek from https://github.com/steineggerlab/foldseek")
                    print("Create a database of 3Di sequences for each protein by following the directions in the HowToConvertTo3Di.txt document")
                elif not os.path.exists(accession_to_ensemble):
                    print("Please input a file mapping ensemble id to accession id named ensemble_to_accession.csv")
            try:
                # Training data comes from the repo's bundled files, not the
                # user-supplied inputs used for the feature matrix above.
                result = develop_matrices('Ligands_withSMILE/ligand_SMILEs.csv', "data_files/TMdomains/TM.csv",
                    "data_files/3DiSequences/fullset_ss.fasta", "olfr_de", "data_files/uniprot_ensemble.csv")
                log.info('Developed training matrices')
            except:
                print('Unable to develop training matrices')
            try:
                acc, rec, bac, TN, FN, TP, FP, log_loss, clf = train(result['X'], result['Y'], False)
                log.info('Trained model')
            except:
                print('Unable to train model')
            try:
                # Probability of the positive class for each user-supplied pair.
                y_pred=clf.predict_proba(features)[:,1]
                print(proteins)
                print(ligands)
                print(y_pred)
                log.info('Formed predictions')
            except:
                print('Unable to form predictions')
            finally:
                removeRFDirectories()
                log.info('Removed temporary directories')
        else:
            # Default RF mode: build matrices from the user's inputs, train,
            # and report evaluation metrics.
            try:
                createRFDirectories()
            except:
                print('Failed to make temporary directories')
            try:
                make_accession_list(proteins, protein_structure_folder)
                log.info("Made list of accessions")
            except:
                print('Failed to create list of protein accessions')
                if not os.path.exists(protein_structure_folder):
                    print('Please input pdb files into a folder in your working directory called input_protein_pdb')
            try:
                scrape_TMs(proteins, TMs, TM_csv)
                log.info('Scraped TMs')
            except:
                print('Unable to scrape TMs')
                if not os.path.exists(proteins):
                    print('Cannot find list of accession names')
                elif not os.path.exists(TMs):
                    print('Failed to create txt file of TM domains')
                elif not os.path.exists(TM_csv):
                    print('Failed to create csv file of TM domains')
            try:
                result = develop_matrices(ligand_csv, TM_csv, Di_fasta, experimental_results, accession_to_ensemble)
                log.info('Created input matrices')
            except:
                print('Unable to create input matrices')
                if not os.path.exists(ligand_csv):
                    print('Please upload a csv of ligand smiles into the file path: "input_ligand_smiles/smiles.csv" with format Ligands,SMILE')
                elif not os.path.exists(Di_fasta):
                    print("Please download foldseek from https://github.com/steineggerlab/foldseek")
                    print("Create a database of 3Di sequences for each protein by following the directions in the HowToConvertTo3Di.txt document")
                elif not os.path.exists(experimental_results):
                    print("Please input csv files titled by each of the ligands containing data on ensembl_gene_id, logFC, and FDR for each protein")
                elif not os.path.exists(accession_to_ensemble):
                    print("Please input a file mapping ensemble id to accession id named ensemble_to_accession.csv")
            try:
                acc, rec, bac, TN, FN, TP, FP, log_loss, clf = train(result['X'], result['Y'], False)
                # NOTE(review): the printed labels may not match the variable
                # names (e.g. `acc` printed as 'ROC-AUC') -- confirm against
                # the return order of RF.FixedClassificationModel.train.
                print('ROC-AUC')
                print(acc)
                print('Precision-Recall AUC')
                print(rec)
                print('Balanced Accuracy')
                print(bac)
                print('Binary Cross Entropy')
                print(log_loss)
            except:
                print("There was an error in training or testing the model")
            finally:
                removeRFDirectories()
                log.info('Removed temporary directories')
    else:
        print('error: the following arguments are missing: model')
if __name__ == '__main__':
    # Run the CLI only when executed as a script, not when imported.
    ppp()