forked from CapraLab/cosmis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcosmis_complex.py
478 lines (409 loc) · 16.3 KB
/
cosmis_complex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
#!/usr/bin/env python3
"""
@summary: This is the single-protein version of the COSMIS method, hence its
name 'cosmis_sp.py'. It was designed to work with proteins for which the
COSMIS scores were not already computed, because the protein structure
database used wasn't up to date. However, if there exists a structure or
homology model for the protein of interest, its COSMIS scores can still be
computed using this script.
This script assumes that residues in the PDB file are numbered according to
the amino acid sequence in UniProt.
@author: Bian Li
@contact: bian.li@vanderbilt.edu
@change: Last modified 2/12/2021.
"""
import csv
import gzip
import json
import sys
import logging
import numpy as np
from argparse import ArgumentParser
from collections import defaultdict
from Bio import SeqIO
from Bio.PDB import PDBParser, is_aa
from Bio.SeqUtils import seq1
from cosmis.utils import pdb_utils, seq_utils
def parse_cmd():
"""
Specifies command-line flags and parses command-line options.
Returns
-------
ArgumentParser
An object of type ArgumentParser containing the parsed command-line
arguments.
"""
parser = ArgumentParser()
parser.add_argument('-c', '--config', dest='config', required=True,
type=str, help='A JSON file specifying options.')
parser.add_argument('-i', '--input', dest='input', required=True,
type=str, help='''Pairs of PDB chain ID:Ensembl
transcript ID associated with the input PDB file.''')
parser.add_argument('-p', '--pdb', dest='pdb_file', required=True,
type=str, help='''PDB file containing the structure
of the protein subunits of the given transcripts.''')
parser.add_argument('-o', '--output', dest='output_file', required=True,
type=str, help='''Output file to store the COSMIS scores
of the protein.''')
parser.add_argument('-l', '--log', dest='log_file', required=False, type=str,
default='cosmis_complex.log', help='''Logging file.''')
parser.add_argument('--chain', dest='pdb_chain', default='A', type=str,
help='Chain ID of the subunit in the PBD file.')
parser.add_argument('-w', '--overwrite', dest='overwrite', required=False,
action='store_true', help='''Whether to overwrite
already computed MTR3D scores.''')
parser.add_argument('-v', '--verbose', dest='verbose', required=False,
action='store_true', help='''Whether to output verbose
data: number of contacting residues and number of
missense and synonymous variants in the neighborhood
of the mutation site.''')
return parser.parse_args()
def get_ensembl_accession(record):
"""
Convert a FASTA file record into an accession ID.
Parameters
----------
record : str
Record ID is of the format: ">CCDS2.2|Hs109|chr1"
Returns
-------
str
Accession ID that can be used as dictionary key.
"""
parts = record.id.split('.')
return parts[0]
def get_uniprot_accession(record):
"""
Convenience function to work with Biopython SeqIO.
Parameters
----------
record : Biopython SeqRecord
A Biopython SeqRecord object.
Returns
-------
str
UniProt accession code.
"""
parts = record.id.split('|')
return parts[1]
def parse_config(config):
"""
Parses the configuration file for computing COSMIS scores into a Python
dictionary.
Parameters
----------
config : json
Configuration file in JSON format.
Returns
-------
dict
A Python dictionary.
"""
with open(config, 'rt') as ipf:
configs = json.load(ipf)
# do necessary sanity checks before return
return configs
def count_variants(variants):
"""
Collects the statistics about position-specific counts of missense and
synonymous variants.
Parameters
----------
variants : list
A list of variant identifiers: ['A123B', 'C456D']
Returns
-------
dict
A dictionary where the key is amino acid position and the value is
the number of variants at this position. One dictionary for missense
variants and one dictionary for synonymous variants.
"""
missense_counts = defaultdict(int)
synonymous_counts = defaultdict(int)
for variant in variants:
vv, ac, an = variant
w = vv[0] # wild-type amino acid
v = vv[-1] # mutant amino acid
pos = vv[1:-1] # position in the protein sequence
# only consider rare variants
if int(ac) / int(an) > 0.001:
continue
if w != v: # missense variant
missense_counts[int(pos)] += 1
else: # synonymous variant
synonymous_counts[int(pos)] += 1
return missense_counts, synonymous_counts
def get_transcript_info(
uniprot_id, enst_ids, cds_dict, pep_dict, variant_dict, enst_mp_counts
):
"""
Parameters
----------
transcript
Returns
-------
"""
pep_seq = pep_dict[uniprot_id]
valid_ensts = []
for enst_id in enst_ids:
try:
cds_seq = cds_dict[enst_id].seq
except KeyError:
logging.critical(f'No CDS found for {enst_id} in Ensembl CDS database!')
continue
# skip if the CDS is incomplete
if not seq_utils.is_valid_cds(cds_seq):
print('Error: Invalid CDS.'.format(enst_id))
continue
if len(pep_seq) == len(cds_seq) // 3 - 1:
valid_ensts.append(enst_id)
if not valid_ensts:
raise ValueError(
'Error: {} are not compatible with {}.'.format(enst_ids, uniprot_id)
)
# get the one with most variable positions
max_len = 0
best_enst_ids = valid_ensts[0]
for enst_id in valid_ensts:
try:
var_pos = len(variant_dict[enst_id]['variants'])
except KeyError:
continue
if max_len < var_pos:
max_len = var_pos
best_enst_id = enst_id
# print(f'Best ENST ID for {uniprot_id} is {best_enst_id}.')
# get the amino acid sequence of the transcript
try:
# Ensembl peptide ID for the transcript
ensp_id = variant_dict[best_enst_id]['ensp'][0]
except KeyError:
logging.critical('Transcript {best_enst_id} not found in gnomAD database')
sys.exit(1)
try:
total_exp_mis_counts = enst_mp_counts[best_enst_id][-1]
total_exp_syn_counts = enst_mp_counts[best_enst_id][-2]
except KeyError:
logging.critical(f'Transcript {best_enst_id} not found in transcript variant count file.')
sys.exit(1)
# get all variants of this transcript reported in gnomAD
try:
variants = variant_dict[best_enst_id]['variants']
except KeyError:
logging.critical(f'No variants found for {best_enst_id} in gnomAD')
sys.exit(1)
# get the coding sequence of the transcript
try:
transcript_cds = cds_dict[best_enst_id].seq
except KeyError:
logging.critical(f'No CDS found for {best_enst_id} in Ensembl CDS database!')
transcript_cds = None
# check that the CDS does not contain invalid nucleotides
if not seq_utils.is_valid_cds(transcript_cds):
print('ERROR: invalid CDS for', transcript_cds)
sys.exit(1)
# get mutation rates and expected counts for each codon
transcript_cds = transcript_cds[:-3] # remove the stop codon
codon_mutation_rates = seq_utils.get_codon_mutation_rates(transcript_cds)
all_cds_ns_counts = seq_utils.count_poss_ns_variants(transcript_cds)
# tabulate variants at each site
# missense_counts and synonymous_counts are dictionary that maps
# amino acid positions to variant counts
missense_counts, synonymous_counts = count_variants(variants)
# permutation test
codon_mis_probs = [x[1] for x in codon_mutation_rates]
codon_syn_probs = [x[0] for x in codon_mutation_rates]
mis_p = codon_mis_probs / np.sum(codon_mis_probs)
syn_p = codon_syn_probs / np.sum(codon_syn_probs)
mis_pmt_matrix = seq_utils.permute_variants(
total_exp_mis_counts, len(pep_seq), mis_p
)
syn_pmt_matrix = seq_utils.permute_variants(
total_exp_syn_counts, len(pep_seq), syn_p
)
# return transcript statistics
return ensp_id, codon_mutation_rates, missense_counts, \
synonymous_counts, mis_pmt_matrix, syn_pmt_matrix, \
all_cds_ns_counts, pep_seq
def load_datasets(configs):
"""
Parameters
----------
configs
Returns
-------
"""
# ENSEMBL cds
print('Reading ENSEMBL CDS database ...')
with gzip.open(configs['ensembl_cds'], 'rt') as cds_handle:
enst_cds_dict = SeqIO.to_dict(
SeqIO.parse(cds_handle, format='fasta'),
key_function=get_ensembl_accession
)
# ENSEMBL peptide sequences
print('Reading UniProt protein sequence database ...')
with gzip.open(configs['uniprot_pep'], 'rt') as pep_handle:
pep_dict = SeqIO.to_dict(
SeqIO.parse(pep_handle, format='fasta'),
key_function=get_uniprot_accession
)
# parse gnomad transcript-level variants
print('Reading gnomAD variant database ...')
with open(configs['gnomad_variants'], 'rt') as variant_handle:
# transcript_variants will be a dict of dicts where major version
# ENSEMBL transcript IDs are the first level keys and "ccds", "ensp",
# "swissprot", "variants" are the second level keys. The value of each
# second-level key is a Python list.
enst_variants = json.load(variant_handle)
# parse the file that maps Ensembl transcript IDs to PDB IDs
with open(configs['uniprot_to_enst'], 'rt') as ipf:
uniprot_to_enst = json.load(ipf)
# get transcript mutation probabilities and variant counts
print('Reading transcript mutation probabilities and variant counts ...')
enst_mp_counts = seq_utils.read_enst_mp_count(configs['enst_mp_counts'])
return (enst_cds_dict, pep_dict, enst_variants, uniprot_to_enst, enst_mp_counts)
def main():
# parse command-line arguments
args = parse_cmd()
# configure the logging system
logging.basicConfig(
filename=args.log_file,
level=logging.INFO,
filemode='w',
format='%(levelname)s:%(asctime)s:%(message)s'
)
# parse configuration file
configs = parse_config(args.config)
# load datasets
cds_dict, pep_dict, variant_dict, uniprot_to_enst, enst_mp_counts = load_datasets(configs)
# parse chain-uniprot pairs
with open(args.input, 'rt') as ipf:
chain_uniprot_pairs = {
c: u for c, u in [line.strip().split(':') for line in ipf]
}
uniprot_id = chain_uniprot_pairs[args.pdb_chain]
# data structures for storing stats and annotations
cosmis_scores = []
ensp_ids = {}
codon_mutation_rates = {}
mis_counts = {}
syn_counts = {}
mis_pmt_matrices = {}
syn_pmt_matrices = {}
all_cds_ns_counts = {}
pep_seqs = {}
for c, u in chain_uniprot_pairs.items():
# find the right enst ID
try:
enst_ids = uniprot_to_enst[u]
except KeyError:
logging.critical(
'No transcript IDs were mapped to {}.'.format(u)
)
sys.exit(1)
try:
results = get_transcript_info(
u, enst_ids, cds_dict, pep_dict, variant_dict, enst_mp_counts
)
except ValueError:
logging.critical('No valid CDS found for {}.'.format(u))
sys.exit(1)
except KeyError:
logging.critical('No transcript record found for {} in gnomAD.'.format(u))
sys.exit(1)
ensp_ids[c] = results[0]
codon_mutation_rates[c] = results[1]
mis_counts[c] = results[2]
syn_counts[c] = results[3]
mis_pmt_matrices[c] = results[4]
syn_pmt_matrices[c] = results[5]
all_cds_ns_counts[c] = results[6]
pep_seqs[c] = results[7]
# index all contacts by residue ID
pdb_parser = PDBParser(PERMISSIVE=1)
structure = pdb_parser.get_structure(id='NA', file=args.pdb_file)
all_aa_residues = [aa for aa in structure[0].get_residues() if is_aa(aa)]
all_contacts = pdb_utils.search_for_all_contacts(all_aa_residues, radius=8)
indexed_contacts = defaultdict(list)
for c in all_contacts:
indexed_contacts[c.get_res_a()].append(c.get_res_b())
indexed_contacts[c.get_res_b()].append(c.get_res_a())
chain_id = args.pdb_chain
chain_struct = structure[0][chain_id]
for seq_pos, seq_aa in enumerate(pep_seqs[chain_id], start=1):
# check that the amino acid in ENSP sequence matches
# that in the PDB structure
try:
res = chain_struct[seq_pos]
except KeyError:
print(
'Residue %s not found in chain %s in PDB file: %s' %
(seq_pos, chain_id, args.pdb_file)
)
continue
pdb_aa = seq1(res.get_resname())
if seq_aa != pdb_aa:
print('Residue in ENSP did not match that in PDB at', seq_pos)
continue
contact_res = indexed_contacts[res]
print('Current residue:', res.get_full_id())
total_missense_obs = mis_counts[chain_id].setdefault(seq_pos, 0)
total_synonymous_obs = syn_counts[chain_id].setdefault(seq_pos, 0)
total_missense_poss = all_cds_ns_counts[chain_id][seq_pos - 1][0]
total_synonyms_poss = all_cds_ns_counts[chain_id][seq_pos - 1][1]
total_synonymous_rate = codon_mutation_rates[chain_id][seq_pos - 1][0]
total_missense_rate = codon_mutation_rates[chain_id][seq_pos - 1][1]
for c_res in contact_res:
# count the total # observed variants in contacting residues
print('\tContacts:', c_res.get_full_id())
contact_chain_id = c_res.get_full_id()[2]
j = c_res.get_full_id()[3][1]
total_missense_obs += mis_counts[contact_chain_id].setdefault(j, 0)
total_synonymous_obs += syn_counts[contact_chain_id].setdefault(j, 0)
# count the total # expected variants
try:
total_missense_poss += all_cds_ns_counts[contact_chain_id][j - 1][0]
total_synonyms_poss += all_cds_ns_counts[contact_chain_id][j - 1][1]
total_synonymous_rate += codon_mutation_rates[contact_chain_id][j - 1][0]
total_missense_rate += codon_mutation_rates[contact_chain_id][j - 1][1]
except IndexError:
print('{} not in CDS that has {} residues.'.format(j, len(
all_cds_ns_counts)))
sys.exit(1)
mis_pmt_mean, mis_pmt_sd, mis_p_value = seq_utils.get_permutation_stats(
mis_pmt_matrices, contact_res + [res], total_missense_obs
)
syn_pmt_mean, syn_pmt_sd, syn_p_value = seq_utils.get_permutation_stats(
syn_pmt_matrices, contact_res + [res], total_synonymous_obs
)
# add entries to be written to disk file
cosmis_scores.append(
[
uniprot_id, seq_pos, seq_aa,
total_synonyms_poss,
total_missense_poss,
'%.3e' % total_synonymous_rate,
total_synonymous_obs,
'%.3e' % total_missense_rate,
total_missense_obs,
'{:.3f}'.format(mis_pmt_mean),
'{:.3f}'.format(mis_pmt_sd),
'{:.3e}'.format(mis_p_value),
'{:.3f}'.format(syn_pmt_mean),
'{:.3f}'.format(syn_pmt_sd),
'{:.3e}'.format(syn_p_value),
]
)
with open(file=args.output_file, mode='wt') as opf:
header = [
'uniprot_id', 'ensp_pos', 'ensp_aa',
'cs_syn_poss', 'cs_mis_poss',
'cs_syn_prob', 'cs_syn_obs', 'cs_mis_prob', 'cs_mis_obs',
'mis_pmt_mean', 'mis_pmt_sd', 'mis_p_value', 'syn_pmt_mean',
'syn_pmt_sd', 'syn_p_value'
]
csv_writer = csv.writer(opf, delimiter='\t')
csv_writer.writerow(header)
csv_writer.writerows(cosmis_scores)
if __name__ == '__main__':
main()