"""
distance.py
Distance functions to be called by the analysis functions.
TODO:
- Standardize inputs to distance functions to make the architecture more "plug-n-play"
- Release a version of seqtools with the required functions to satisfy dependencies
Generally a distance function should have the following inputs:
- Sieve data object
- Parameters
(Q: I use a dict of params to simplify the arguments, but is it confusing that each distance function requires different params?)
and should return either:
- A 2D [nPTIDs x nSites] pd.DataFrame
- A 3D [nPTIDs x nSites x nParams] numpy.ndarray for multiple parameter values
NOTES:
Currently it looks as though, with low nPerms and many simulation repeats, prepareBA and the binding distance computations
are the speed bottleneck. I could probably speed up the distance computation by using the 4D MATLAB-style BA matrix and numexpr,
since this would vectorize the <, >, and sum operations."""
from seqdistance.matrices import binarySubst, addGapScores
from seqdistance import hamming_distance
import numpy as np
import pandas as pd
import itertools
from HLAPredCache import *
from decimal import Decimal
__all__=['_vxmatch_distance',
'_binding_escape_distance',
'_nepitope_distance',
'_indel_escape_distance',
'_prepareSeqBA',
'_prepareAlignBA',
'_prepareBTBA',
'_prepareBA',
'_similarity_score',
'_findall',
'_binding_scan_distance',
'_indel_scan_distance',
'_epitope_mismatch_distance',
'_relative_binding_escape_distance']
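"""Usage sketch (illustrative only; not called anywhere in this module). Assuming a
pysieve sieveData object `data` and an HLAPredCache-like object `ba` mapping
(allele, peptide) tuples to log-IC50 predictions (and exposing a predictionMethod
attribute), a binding-escape analysis would be wired up roughly like this. The
threshold values below are placeholders: an insert kmer "binds" when its log-IC50 is
below params['binding'] and a breakthrough kmer "escapes" when its log-IC50 is above
params['escape'].

    params = {'nmer': 9,
              'binding': 6.0,
              'escape': 7.0,
              'ignoreGappedKmers': False,
              'getFast': False,
              'fullBT': False}
    insertBA, btBA, method = _prepareBA(data, ba, params)
    dist = _binding_escape_distance(data.insertSeq, insertBA, data.seqDf, btBA, params)
    assert dist.shape == (data.seqDf.shape[0], insertBA.shape[1])
"""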
def _vxmatch_distance(insertSeq, seqDf, params):
"""Computes a vaccine insert distance for each breakthrough sequence
based on the specified "subst" similarity matrix.
Used by the global and local vxmatch sieve analyses.
NOTE:
Normalization is performed using a mean similarity across the sequence, but applied site-wise.
This is important for stability in the normalization, but may underestimate the contribution of
rare amino-acids when non-binary and non-normalized similarity matrices are used (e.g. HIV PAM)
Parameters
----------
insertSeq : str
Amino acid sequence of the vaccine insert/immunogen
seqDf : pd.DataFrame
Contains the breakthrough sequences (see pysieve.data for details)
params : dict
Should contain a similarity matrix 'subst', a dict of amino acid similarity scores.
Returns
-------
dist : pd.DataFrame
Distance matrix [ptids x sites] with PTID row-index and sites (0-indexed) as columns."""
try:
subst = params['subst']
except KeyError:
print 'Using default binary substitution matrix.'
subst = addGapScores(binarySubst, None)
N = seqDf.shape[0]
nSites = len(insertSeq)
sim = np.ones((N, nSites))
"""Compute insert2insert similarity for normalization.
Similarity of insert to itself is high (exactly 1 for a hamming distance)"""
insert2insert = np.nanmean(_similarity_score(insertSeq, insertSeq, subst = subst))
for ptidi,ptid in enumerate(seqDf.index):
"""Similarity of a sequence to itself is also high and will be used in the normalization"""
seq2seq = np.nanmean(_similarity_score(seqDf.seq[ptid], seqDf.seq[ptid], subst = subst))
sim[ptidi,:] = _similarity_score(seqDf.seq[ptid], insertSeq, subst = subst, denominator = seq2seq + insert2insert)
"""similarity --> distance only works with correct normalization"""
dist = 1 - sim
return pd.DataFrame(dist, index = seqDf.index, columns = np.arange(nSites))
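"""Worked example of the normalization above (hypothetical numbers): with a binary
substitution matrix (match similarity 1), insert2insert and seq2seq are both 1, so each
site's similarity is 2*sim12 / (1 + 1) = sim12 and dist = 1 - sim12, i.e. a site-wise
hamming distance. With a non-binary matrix (e.g. HIV PAM) the two self-similarity means
rescale sim12 so that a sequence compared with itself still averages to a distance near 0."""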
def _nepitope_distance(insertBA, btBA, params):
"""Creates a distance matrix (DataFrame) [N x sites]
indicating the number of PTIDs predicted to have an insert epitope for each kmer
Parameters
----------
insertBA : pd.DataFrame
Row index as HLAs and columns as sites, shape [len(uHLA4) x nSites]
btBA : dict
Dict contains keys for (1) "validHLAs" HLAs used, (2) "ba" array([nHLAs (4 typically), nSites]) and (3) "ptid"
params : dict
Should contain 'binding' and 'nmer' parameters.
Returns
-------
dist : pd.DataFrame
Distance matrix [ptids x sites] with PTID row-index and sites (kmer start positions 0-indexed) as columns."""
N = len(btBA['ptid'])
nSites = insertBA.shape[1]
dist = np.nan * np.ones((N,nSites))
for ptidi,ptid,ba,h in zip(np.arange(N), btBA['ptid'], btBA['ba'], btBA['validHLAs']):
tmpInsert = insertBA.loc[h]
"""Do not double count escapes from homozygous alleles"""
dummy,uniqi = np.unique(h, return_index = True)
"""Note that the original binding escape distance sums across HLAs not any's """
#dist[ptidi,:]=np.squeeze(np.any((tmpInsert<params['binding']),axis=0))
dist[ptidi,:] = np.squeeze(np.sum((tmpInsert < params['binding']).values[uniqi,:], axis = 0))
return pd.DataFrame(dist, index = btBA['ptid'], columns = np.arange(nSites))
def _indel_escape_distance(insertSeq, insertBA, seqDf, btBA, params):
"""Creates a distance matrix (DataFrame) [N x sites] with PTID rows and sites as columns
populated with the HLA binding escape count distance
Parameters
----------
insertSeq : str
Amino acid sequence of the vaccine insert/immunogen
insertBA : pd.DataFrame
Row index as HLAs and columns as sites, shape [len(uHLA4) x nSites]
seqDf : pd.DataFrame
Contains the breakthrough sequences (see pysieve.data for details)
btBA : dict
Dict contains keys for (1) "validHLAs" HLAs used, (2) "ba" array([nHLAs (4 typically), nSites]) and (3) "ptid"
params : dict
Should contain 'binding' and 'nmer' parameters.
Returns
-------
dist : pd.DataFrame
Distance matrix [ptids x sites] with PTID row-index and sites (kmer start positions 0-indexed) as columns."""
N = seqDf.shape[0]
nSites = insertBA.shape[1]
dist = np.nan * np.ones((N, nSites))
for ptidi,ptid in enumerate(seqDf.index):
"""An indel is 'shared' if its found in both the insert and breakthrough kmer"""
unsharedIndel = np.zeros((len(btBA['validHLAs'][ptidi]), nSites), dtype = bool)
for sitei in range(nSites):
"""grabKmer returns None if the kmer is invalid (e.g. off the end of the sequence)"""
insertMer,_nonGapped = grabKmer(insertSeq, sitei, params['nmer'])
btMer,_nonGapped = grabKmer(seqDf.seq[ptid], sitei, params['nmer'])
"""If the insert and the bt mer don't have gaps in the same places then there is an indel!"""
if not insertMer is None:
if btMer is None or not np.all(np.array(_findall(insertMer,'-')) == np.array(_findall(btMer,'-'))):
unsharedIndel[:,sitei] = True
tmpInsert = insertBA.loc[btBA['validHLAs'][ptidi]]
"""Do not double count escapes from homozygous alleles"""
dummy,uniqi = np.unique(btBA['validHLAs'][ptidi], return_index=True)
"""ANY HLA: Escape count is 1 if the kmer binds and there is an indel in the BT seq"""
#dist[ptidi,:] = np.squeeze(np.any((tmpInsert < params['binding']) & unsharedIndel, axis=0))
"""SUM HLAs: count 1 escape per HLA that binds (not including homozygous alleles"""
dist[ptidi,:] = np.squeeze(np.sum(((tmpInsert < params['binding']).values & unsharedIndel)[uniqi,:],axis=0))
return pd.DataFrame(dist, index = btBA['ptid'], columns = np.arange(nSites))
def _binding_escape_distance(insertSeq, insertBA, seqDf, btBA, params):
"""Creates a distance matrix (pd.DataFrame) [ptids x sites]
populated with the HLA binding escape count distance.
TODO:
- Handle 15mer 'unique core' distances
- Standardize the input arguments to match other distances
(or at least other T cell based distances)
- Make the handling of homozygous alleles and multiple escapes per kmer
a parameter.
Parameters
----------
insertSeq : str
Amino acid sequence of the vaccine insert/immunogen
insertBA : pd.DataFrame
Row index as HLAs and columns as sites, shape [len(uHLA4) x nSites]
seqDf : pd.DataFrame
Contains the breakthrough sequences (see pysieve.data for details)
btBA : dict
Dict contains keys for (1) "validHLAs" HLAs used, (2) "ba" array([nHLAs (4 typically), nSites]) and (3) "ptid"
params : dict
Should contain 'binding', 'escape' and 'nmer' parameters.
Returns
-------
dist : pd.DataFrame
Distance matrix [ptids x sites] with PTID row-index and sites (kmer start positions 0-indexed) as columns."""
N = len(btBA['ptid'])
nSites = insertBA.shape[1]
"""Don't count a binding escape if there's also an insertion/deletion there
(these distances should be mutually exclusive)
Import to make indelDist 0s and 1s to work for this purpose"""
indelDist = (_indel_escape_distance(insertSeq, insertBA, seqDf, btBA, params).values > 0).astype(int)
dist = np.nan * np.ones((N,nSites))
for ptidi,ptid,ba,h in zip(np.arange(N), btBA['ptid'], btBA['ba'], btBA['validHLAs']):
"""Maxtrix of binding affinities for the hla alleles in h"""
tmpInsert = insertBA.loc[h]
"""Do not double count escapes from homozygous alleles"""
dummy,uniqi = np.unique(h, return_index=True)
"""ANY HLA: For each HLA (typically 4 per PTID), if it meets the criteria for this kmer then its an escape"""
#dist[ptidi,:] = np.squeeze(np.any((tmpInsert<params['binding']) & (ba>params['escape']),axis=0)) * (1-indelDist[ptidi,:])
"""SUM HLAS: Count multiple escapes per kmer if the person has multiple alleles with escape"""
dist[ptidi,:] = np.squeeze(np.sum(((tmpInsert < params['binding']).values & (ba > params['escape']))[uniqi,:], axis=0)) * (1 - indelDist[ptidi,:])
return pd.DataFrame(dist, index=btBA['ptid'], columns=np.arange(nSites))
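"""Worked example (hypothetical numbers): for one kmer, a PTID expressing HLAs
h = [A_0201, A_0201, B_5701, B_0801] (homozygous at A), params['binding'] = 6 and
params['escape'] = 7, with
    insert log-IC50:        A_0201 -> 4.5, B_5701 -> 5.8, B_0801 -> 9.0
    breakthrough log-IC50:  A_0201 -> 8.1, B_5701 -> 6.5, B_0801 -> 9.2
A_0201 counts one escape (insert binds: 4.5 < 6; breakthrough escapes: 8.1 > 7) and its
homozygous duplicate is dropped by np.unique; B_5701 is not an escape (6.5 <= 7) and
B_0801 is not an insert binder. The kmer's distance is therefore 1, unless an unshared
indel was detected there, in which case the (1 - indelDist) factor zeroes it."""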
def _prepareSeqBA(seq, hlas, ba, k, ignoreGappedKmers=False, getFast=False):
"""Prepare a matrix of binding affinities for all kmers in seq and all hlas.
Parameters
----------
seq : str
Aligned amino acid sequence
hlas : collection
HLA alleles
ba : dict/HLAPredCache
Contains all necessary predicted binding affinities for populating the matrix.
Keys are tuples (allele, peptide)
k : int
Length of the peptides.
ignoreGappedKmers : bool
If False then kmer continues for full k AA,
but if True then throws out all kmers with a gap at any location in any bt sequence.
getFast : bool
If True, uses the getFast method of hlaPredCache, w/o error checking
Returns
-------
baMat : pd.DataFrame, [nHLAs x nSites]
Matrix of binding affinities with rows HLAs and columns as 0-indexed kmer start positions."""
nSites = len(seq)
baMat = np.nan * np.ones((len(hlas),nSites))
if getFast:
baFunc = lambda t: ba.getFast(t)
"""Replace all '*' here just in case, if trying to use getFast"""
originalHLAs = hlas
hlas = [h.replace('*','_') for h in hlas]
else:
originalHLAs = hlas
baFunc = lambda t: ba[t]
for sitei in range(nSites):
"""ngmer is None if the insert kmer starts with a gap '-', leave these as nan"""
gmer,ngmer = grabKmer(seq,sitei,k)
if not ignoreGappedKmers:
"""Use ngmer which starts at sitei and grabs the next nmer AAs (not counting gaps)"""
mer = ngmer
else:
mer = gmer
if (not mer is None) and (not '-' in mer):
for hlai,hla in enumerate(hlas):
if isvalidHLA(hla):
baMat[hlai,sitei] = baFunc((hla,mer))
return pd.DataFrame(baMat, index = originalHLAs, columns = np.arange(nSites))
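"""Minimal example (hypothetical), using a plain dict in place of an HLAPredCache and
assuming grabKmer returns (None, None) once fewer than k residues remain and that
isvalidHLA accepts alleles written like 'A_0201':
    seq = 'MGARASVLSG'
    hlas = ['A_0201', 'B_5701']
    ba = {(h, seq[i:i+9]): 5.0 for h in hlas for i in range(2)}
    baDf = _prepareSeqBA(seq, hlas, ba, k=9)
baDf is a [2 x 10] DataFrame with 5.0 in columns 0 and 1 and nan at the remaining
start positions, whose kmers run off the end of the sequence."""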
def _prepareAlignBA(seqs, hlas, ba, k, ignoreGappedKmers = False, getFast = False):
"""Prepare a matrix of binding affinities for all kmers, all HLAs and all seqs.
Parameters
----------
seqs : collection
Aligned sequences/strings
hlas : collection
HLA alleles
ba : dict/HLAPredCache
Contains all necessary predicted binding affinities for populating the matrix.
Keys are tuples (allele, peptide)
k : int
Length of the peptides.
ignoreGappedKmers : bool
If False then kmer continues for full k AA,
but if True then throws out all kmers with a gap at any location in any bt sequence.
getFast : bool
If True, uses the getFast method of hlaPredCache, w/o error checking
Returns
-------
baMat : ndarray [nSeqs x nHLAs x nSites]
Matrix of binding affinities"""
nSites = int(np.median([len(s) for s in seqs]))
baMat = np.nan * np.ones((len(seqs), len(hlas), nSites))
"""Go through each person, creating a bt BA [len(hla) x nSites] and assign to the big matrix"""
for seqi,seq in enumerate(seqs):
baMat[seqi,:,:] = _prepareSeqBA(seq, hlas, ba, k, ignoreGappedKmers, getFast).values
"""Ignore any kmer that had a gap in any BT sequence"""
if ignoreGappedKmers:
"""If the BA is nan for all HLAs then the kmer must have had a gap.
If aross people, any kmer had a gap then set all people nan there"""
badSites = np.any(np.all(np.isnan(baMat), axis=1), axis=0)
"""TODO: this could be simplified using numpy broadcasting"""
baMat[np.tile(badSites[None,None,:], (baMat.shape[0], baMat.shape[1],1))] = np.nan
return baMat
def _prepareBTBA(data, ba, params):
"""Prepare matrix of log-IC50 binding affinities of BREAKTHROUGH sequences given the dict/HLAPredCache ba
Only the BA for alleles expressed by each subject are returned.
Q:What calls this function and why?
Parameters
----------
data : pysieve.sieveData object
ba : dict/hlaPredCache
params : dict
Required keys: nmer, ignoreGappedKmers, getFast
Returns
-------
btBA : dict of lists with rows per ptid
Dict keys for (1) "validHLAs" HLAs used,
(2) "ba" array([nHLAs (4 typically), nSites])
(3) "ptid"
"""
nSites = len(data.insertSeq)
fullBTBA = _prepareAlignBA(data.seqDf.seq,
data.uHLA4,
ba,
params['nmer'],
ignoreGappedKmers = params['ignoreGappedKmers'],
getFast = params['getFast'])
"""Go through each person, creating a bt BA [len(hla) x nSites]"""
btBA = {'ptid':[],'validHLAs':[],'ba':[]}
for ptid,row in data.seqDf.iterrows():
ptidi = list(data.seqDf.index).index(ptid)
btSeq = row['seq']
HLAs = data.ptidDf.hla[ptid]
validHLAs = []
for hla in HLAs:
if isvalidHLA(hla):
validHLAs.append(hla)
"""New way using the full BTBA"""
tmpba = np.nan * np.ones((len(validHLAs),nSites))
for i,h in enumerate(validHLAs):
hlai = list(data.uHLA4).index(h)
tmpba[i,:]=fullBTBA[ptidi,hlai,:]
btBA['ptid'].append(ptid)
btBA['validHLAs'].append(validHLAs)
btBA['ba'].append(tmpba)
return btBA
def _prepareBA(data, ba, params):
"""Prepare matrices of log-IC50 HLA binding affinities for insert and breakthrough sequences
Used by several T-cell based sieve distances.
TODO:
Currently this function returns two different btBA data objects depending on fullBT
This is probably not a good idea.
Parameters
----------
data : pysieve.sieveData object
ba : dict/hlaPredCache
params : dict
Required keys: nmer, fullBT, ignoreGappedKmers, getFast
Returns
-------
insertBA : pd.DataFrame, shape [len(uHLA4) x nSites]
Row index as HLAs and columns as start positions
btBA : variable
Dict of lists with rows per ptid, dict keys for
(1) "validHLAs" HLAs used,
(2) "ba" array([nHLAs (4 typically), nSites])
(3) "ptid"
OR if fullBT is True, a 3D ndarray [nSeqs x nHLAs x nSites]
method : str
Describing the prediction method used (from ba)"""
fullBT = params.get('fullBT',False)
getFast = params.get('getFast',False)
ignoreGappedKmers = params.get('ignoreGappedKmers',False)
"""Create matrix of insert BA [len(uHLA4) x nSites]"""
insertBA = _prepareSeqBA(data.insertSeq,
data.uHLA4,
ba,
params['nmer'],
ignoreGappedKmers = params['ignoreGappedKmers'],
getFast = params['getFast'])
if not fullBT:
btBA = _prepareBTBA(data,ba,params)
else:
"""New structure required by _relative_binding_escape distance"""
btBA = _prepareAlignBA(data.seqDf.seq,
data.uHLA4,
ba,
params['nmer'],
ignoreGappedKmers = params['ignoreGappedKmers'],
getFast = params['getFast'])
return insertBA, btBA, ba.predictionMethod
def _similarity_score(seq1, seq2, subst, denominator = 2):
"""Return a vector of site-wise similarities between two sequences based on a substitution matrix (dict).
Optionally can give a denominator for normalization.
Example denominator: sim11 + sim22, the sum of seq1's similarity to itself and seq2's similarity to itself.
The denominator can be supplied as a vector, in which case the normalization is done site-wise, or
as a scalar, in which case it is equivalent to applying the normalization to the whole sequence
(even though this function will still give the distance site-wise).
By default there is no normalization (denominator = 2).
This can create a problem for vxmatch using similarity matrices other than binarySubst"""
"""Similarity between seq1 and seq2 using the substitution matrix subst"""
sim12 = np.array([i for i in itertools.imap(lambda a,b: subst.get((a,b), subst.get((b,a))), seq1, seq2)])
return (2 * sim12) / denominator
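"""Example with a hand-built substitution dict (a stand-in for the seqdistance matrices):
    subst = {('A','A'): 1., ('C','C'): 1., ('A','C'): 0.}
    _similarity_score('AAC', 'ACC', subst)                  # -> array([1., 0., 1.])
    _similarity_score('AAC', 'ACC', subst, denominator=4.)  # -> array([0.5, 0., 0.5])
With a binary matrix and the seq2seq + insert2insert denominator used in
_vxmatch_distance, 1 - similarity reduces to a site-wise hamming distance."""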
def _findall(s, item):
"""Return index of each element equal to item in s"""
return [i for i,x in enumerate(s) if x == item]
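"""Example: _findall('MG-AR-', '-') returns [2, 5]; used to compare gap positions
between insert and breakthrough kmers in the indel escape distances."""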
def _indel_scan_distance(insertSeq, insertBA, seqDf, btBA, nmer, paramPairs, minDelta):
"""Creates a distance matrix ndarray [N x sites x params]
populated with the indel escape count distance"""
N = seqDf.shape[0]
nSites = insertBA.shape[1]
nParams = len(paramPairs)
minDelta = Decimal('%1.1f' % minDelta)
dist = np.nan*np.ones((N,nSites,nParams))
for ptidi,ptid in enumerate(seqDf.index):
unsharedIndel = np.zeros((len(btBA['validHLAs'][ptidi]),nSites), dtype=bool)
"""Determine if there are any 'unshared' gaps in each kmer"""
for sitei in xrange(nSites):
insertMer,_nonGapped = grabKmer(insertSeq,sitei,nmer)
btMer,_nonGapped = grabKmer(seqDf.seq[ptid],sitei,nmer)
"""If the insert and the bt mer don't have gaps in the same places then there is an indel!"""
if not insertMer is None:
if btMer is None or not np.all(np.array(_findall(insertMer,'-')) == np.array(_findall(btMer,'-'))):
unsharedIndel[:,sitei] = True
tmpInsert = insertBA.loc[btBA['validHLAs'][ptidi]]
"""Do not double count escapes from homozygous alleles"""
dummy,uniqi = np.unique(btBA['validHLAs'][ptidi], return_index=True)
"""For each parameter pair compute the distance"""
for parami,pp in enumerate(paramPairs):
"""A pair of binding and escape thresholds are only valid if escape - binding > delta"""
if (pp[1]-pp[0]) > minDelta:
"""If any of the HLAs (typically 4 per PTID) meets the criteria for this kmer then its an escape"""
#dist[ptidi,:,parami]=np.squeeze(np.any((tmpInsert < pp[0]) & unsharedIndel,axis=0))
"""The original binding escape distance, for EACH of the HLAs that meets the criteria...its an escape"""
dist[ptidi,:,parami] = np.squeeze(np.sum(((tmpInsert<np.float64(pp[0])).values & unsharedIndel)[uniqi,:],axis=0))
return dist
def _binding_scan_distance(insertSeq, insertBA, seqDf, btBA, paramPairs, minDelta, nmer):
"""Creates a distance matrix ndarray [N x sites x params]
populated with the HLA binding escape count distance"""
N = len(btBA['ptid'])
nSites = insertBA.shape[1]
nParams = len(paramPairs)
minDelta = Decimal('%1.1f' % minDelta)
"""Don't count a binding escape if there's also an indel there (these distances should be mutually exclusive)
Import to dichotomize this distance when used here though"""
#indelDist = (_indel_scan_distance(insertSeq, insertBA, seqDf, btBA, nmer, paramPairs, minDelta) > 0).astype(np.int64)
"""Nan is default value!
This means that the following will be nan in dist:
non-existent insert kmers (ie start with '-') (ACTUALLY THESE SHOULD BE ZEROS)
invalid parameters (insufficient delta)
filteredDist will have nans at kmers that have been filtered out
it doesn't appear that there are other nans
This is important because it means in the compstats for binding escape i could set nan to 0
"""
dist = np.nan * np.ones((N, nSites, nParams))
dummydist = np.nan * np.ones((N, nSites, nParams))
for ptidi,ptid,ba,h in zip(np.arange(N), btBA['ptid'], btBA['ba'], btBA['validHLAs']):
tmpInsert = insertBA.loc[h]
"""Do not double count escapes from homozygous alleles"""
dummy,uniqi = np.unique(h, return_index=True)
for parami,pp in enumerate(paramPairs):
"""A pair of binding and escape thresholds are only valid if escape - binding > delta"""
if (pp[1]-pp[0]) > minDelta:
tmp=(tmpInsert < np.float64(pp[0])).values & (ba > np.float64(pp[1]))
"""Sum across HLAs"""
dist[ptidi,:,parami] = np.sum(tmp[uniqi,:], axis=0) #* (1-indelDist[ptidi,:,parami])
dummydist[ptidi,:,parami] = 1
return dist
def _epitope_mismatch_distance(seqDf, insertSeq, insertDf, insertBA, btBA, params):
"""Creates a distance matrix (DataFrame) [N x sites]
indicating the PTIDs with an insert epitope and a
breakthrough with greater than mmTolerance substitutions relative to the reference.
Parameters
----------
seqDf : pd.DataFrame
PTID as index with seq column containing breakthrough sequences
insertSeq : str
Amino acid sequence of the insert
insertBA : pd.DataFrame
Row index as HLAs and columns as sites, shape [len(uHLA4) x nSites]
btBA : dict
Dict contains keys for (1) "validHLAs" HLAs used, (2) "ba" array([nHLAs (4 typically), nSites]) and (3) "ptid"
params : dict
Should contain binding, nmer, ignoreGappedKmers, and mmTolerance parameters.
Returns
-------
dist : pd.DataFrame
Distance matrix [ptids x sites] with PTID row-index and sites (kmer start positions 0-indexed) as columns."""
N = len(btBA['ptid'])
nSites = insertBA.shape[1]
mmCount = np.zeros((N, nSites))
for sitei in range(nSites):
"""ngmer is None if the insert kmer starts with a gap '-', leave these as nan"""
igmer,ingmer = grabKmer(insertSeq, sitei, params['nmer'])
if not params['ignoreGappedKmers']:
"""Use ngmer which starts at sitei and grabs the next nmer AAs (not counting gaps)"""
imer = ingmer
else:
imer = igmer
for ptidi,ptid in zip(np.arange(N), btBA['ptid']):
btgmer,btngmer = grabKmer(seqDf.seq.loc[ptid], sitei, params['nmer'])
if not params['ignoreGappedKmers']:
"""Use ngmer which starts at sitei and grabs the next nmer AAs (not counting gaps)"""
btmer = btngmer
else:
btmer = btgmer
mmCount[ptidi,sitei] = hamming_distance(imer, btmer, asStrings=True)
dist = np.nan * np.ones((N, nSites))
for ptidi,ptid,ba,h in zip(np.arange(N), btBA['ptid'], btBA['ba'], btBA['validHLAs']):
tmpInsert = insertBA.loc[h]
"""Do not double count escapes from homozygous alleles"""
dummy,uniqi = np.unique(h, return_index=True)
"""ANY HLA allele binds AND mmCount is greater than mmTolerance"""
dist[ptidi,:] = np.squeeze(np.any((tmpInsert < params['binding']).values[uniqi,:], axis=0))
dist[ptidi,:] = dist[ptidi,:] & (mmCount[ptidi,:] > params['mmTolerance'])
return pd.DataFrame(dist.astype(np.float64), index=btBA['ptid'], columns=np.arange(nSites))
def _identifyNoPressureBT(insertBA, hlaMatrix, params):
"""Identify the bt kmers that aren't under pressure from potential vaccine epitopes (the person doesn't have any binding HLAs for the kmer location)
Returns a numpy array [N x 1 x nSites]
insertBA: [nHLAs x nSites] pd.DataFrame
hlaMatrix: boolean matrix of HLA expression [ptid x nHLAs]
params: binding"""
N = hlaMatrix.shape[0]
nSites = insertBA.shape[1]
"""Identify the binding HLAs at each site: [nHLAs x nSites] bool"""
bindingHLAs = insertBA < params['binding']
"""Identify the breakthrough kmers (BA) associated with people lacking a binding allele at each site"""
"""[ptid x hla x site]"""
tmpHLA = np.tile(hlaMatrix.values[:,:,None],(1,1,nSites))
tmpBinding = np.tile(bindingHLAs.values[None,:,:],(N,1,1))
"""Use this to index into btBA to pull out the BA for kmers that were not under pressure by the HLA (e.g. btBA[:,hlai,:][noPressureBT])"""
"""[ptid x 1 x sites]"""
"""For each bt kmer, for it not to be under pressure,
it must either not be an insert binder or not be an HLA allele that the person expresses, across all HLAs"""
noPressureBT = np.all(~tmpBinding | ~tmpHLA, axis=1)[:,None,:]
"""If any sites don't have any bt kmers that are not under pressure then raise an exception"""
if np.any(np.squeeze(noPressureBT.sum(axis=0)) == 0):
raise Exception("Can't compute escape threshold for all kmers!")
return noPressureBT
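"""Example of the logic (hypothetical): suppose at one site only B_5701 is an insert
binder (insertBA < params['binding']). A PTID expressing [A_0201, A_0301, B_0702, B_4001]
has no binding allele there, so their breakthrough kmer is flagged as 'no pressure' and
contributes to the escape-threshold median in _relative_binding_escape_distance; a PTID
expressing B_5701 is excluded at that site."""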
def _relative_binding_escape_distance(insertSeq,insertBA,seqDf,btBA,hlaMatrix,params,lookupDf):
"""Creates a distance matrix (DataFrame) [N x sites] with PTID rows and sites as columns
populated with the HLA binding escape count distance
This is "Allan's distance" and it differs from the binding_escape distance
only in how the escape threshold is determined. The escape threshold is the
median of the BA of the non-binding HLAs with a given BT peptide
insertSeq: AA str
insertBA: [nHLAs x nSites] pd.DataFrame
seqDf: DataFrame with ptid rows and column seq containing BT seqs
btBA: FULL btBA matrix [nSeqs x nHLAs x nSites] ndarray (necessary for this method)
hlaMatrix: boolean matrix of HLA expression [ptid x nHLAs]
params: binding"""
N = btBA.shape[0]
nSites = insertBA.shape[1]
"""Don't count a binding escape if there's also an indel there (these distances should be mutually exclusive)
Import to make indelDist 0s and 1s to work for this purpose"""
#indelDist=(_indel_escape_distance(insertSeq,insertBA,seqDf,btBA,params).values > 0).astype(int64)
"""Identify the breakthrough kmers (BA) associated with people lacking a binding allele at each site"""
noPressureBT = _identifyNoPressureBT(insertBA, hlaMatrix, params)
"""with open(DATA_PATH + 'STEP/andrew_rel_binding_escape.csv','w') as fh:
print >> fh, 'position,seqid,ptid,hla,insert_peptide,bt_peptide,rbe,be,cutoff,ndiff,y,BET'"""
dist = np.nan * np.ones((N,nSites))
for ptidi,ptid in enumerate(seqDf.index):
"""Slice insertBA so that it only contains the valid HLAs of the current ptid"""
validHLAs = [h for h in hlaMatrix.columns[hlaMatrix.ix[ptid]] if isvalidHLA(h)]
validHLAInd = np.array([h in validHLAs for h in hlaMatrix.columns])
"""[4 x nSites]"""
tmpInsert = insertBA.ix[validHLAs].values
"""Do not double count escapes from homozygous alleles"""
uValidHLAs,uniqi = np.unique(validHLAs, return_index=True)
"""For each HLA (typically 4 per PTID), if it meets the criteria for this kmer then its an escape"""
"""[4 x nSites]"""
insertBinders = (tmpInsert < params['binding'])
"""This is a complicated step:
(1) Pull out the BA for the HLAs associated with this ptid [N x 4 x nSites]
(2) Tile the noPressureBT index matrix along the hla axis [N x 4 x nSites]
(3) Index (1) using (2) to yield a matrix of the btBAs with the current HLAs (4) at the sequences of people that had no binding HLA
[nonBinders x 4 x nSites]
(4) Take the median across nonBinders
[4 x nSites]
"""
"""Set all btBA that are under pressure from an HLA to nan prior to taking the median across bt kmers"""
tmpNoPressureInd = np.tile(noPressureBT, (1,len(validHLAs),1))
tmpBtBA = btBA[:,validHLAInd,:]
tmpBtBA[~tmpNoPressureInd] = np.nan
"""tauThresholds [4 x nSites]"""
tauThresholds = np.nanmedian(tmpBtBA, axis=0)
"""nonBTBinders [4 x nSites]"""
nonBTBinders = (btBA[ptidi,validHLAInd,:] > tauThresholds) & (tmpInsert < btBA[ptidi,validHLAInd,:])
escapes = (insertBinders & nonBTBinders)[uniqi,:]
dist[ptidi,:] = np.squeeze(np.sum(escapes, axis=0))# * (1-indelDist[ptidi,:])
"""for hi in arange(escapes.shape[0]):
hla=uValidHLAs[hi]
hlai=insertBA.index==hla
seqid=lookupDf.index[lookupDf.ptid==ptid][0]
for kmeri in arange(escapes.shape[1]):
gapped,insertKmer=grabKmer(insertSeq,kmeri,k=params['nmer'])
gapped,btKmer=grabKmer(seqDf.seq[ptid],kmeri,k=params['nmer'])
try:
hdist=hamming_distance(insertKmer,btKmer)
except TypeError:
hdist=nan
if seqid=='5021709' and kmeri==10 and hla=='A_2301':
continue
raise Exception()
#print >> fh, 'position,seqid,ptid,hla,insert_peptide,bt_peptide,rbe,be,cutoff,ndiff,y,BET'
print >> fh, '%d,%s,%s,%s,%s,%s,%1.2f,%1.2f,%1.2f,%1.0f,%d,%1.1f' % (kmeri+1,
seqid,ptid,hla,insertKmer,btKmer,insertBA[kmeri][hlai],btBA[ptidi,hlai,kmeri],
tauThresholds[uniqi,:][hi,kmeri],hdist,escapes[hi,kmeri],params['binding'])"""
return pd.DataFrame(dist, index=seqDf.index, columns=insertBA.columns)