DISTANT_ADMIXTURE_MODELS.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DISTANT_ADMIXTURE_MODELS
Given reads that originated form the same genomic window and a reference panel
of two or more populations, the likelihood of observed reads under four scenarios,
namely, monosomy, disomy, SPH and BPH is calculated. This module is for distant
admixtures, where each descendant haplotype has a certain probability to
originate from one of two ancestral populations.
BPH (Both Parental Homologs) correspond to the presence of three unmatched
haplotypes, while SPH (Single Parental Homolog) correspond to chromosome gains
involving identical homologs.
Daniel Ariad (daniel@ariad.org)
Aug 10, 2021
"""
import pickle, os, sys, bz2, collections, gzip, platform
from functools import reduce
from operator import and_, itemgetter
from itertools import combinations
leg_tuple = collections.namedtuple('leg_tuple', ('chr_id', 'pos', 'ref', 'alt')) #Encodes the rows of the legend table
obs_tuple = collections.namedtuple('obs_tuple', ('pos', 'read_id', 'base')) #Encodes the rows of the observations table
likelihoods_tuple = collections.namedtuple('likelihoods_tuple', ('monosomy', 'disomy', 'SPH', 'BPH')) #Encodes the likelihoods for four scenarios, namely, monosomy, disomy, SPH and BPH.
### Getting a function to count non-zero bits in a positive integer.

class popcount_lk:
    """ Creates an instance for calculating the population count of a
    bitstring, based on a lookup table of 8 bits. """

    def __init__(self):
        """ Creates a lookup table of the Hamming weight of every 8-bit integer. """
        self.lookup_table = bytes.maketrans(bytes(range(1<<8)), bytes(bin(i).count('1') for i in range(1<<8)))
        self.byteorder = sys.byteorder

    def __call__(self, x):
        """ Breaks x, which is a Python integer, into chunks of 8 bits.
        Uses the lookup table to get the population count of each chunk and
        returns the aggregated population count. """
        return sum(x.to_bytes((x.bit_length()>>3)+1, self.byteorder).translate(self.lookup_table))
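# Illustrative sketch of the lookup-table popcount (hypothetical values):
#     popcount = popcount_lk()
#     popcount(0b10110101)     # -> 5, the number of set bits
#     popcount((1 << 64) - 1)  # -> 64, also works for arbitrarily large integers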
class distant_admixture:
    """ Based on the statistical models for distant admixtures and a reference
    panel, it calculates the likelihoods of observed alleles under various
    statistical models (monosomy, disomy, SPH and BPH). """

    def __init__(self, obs_tab, leg_tab, hap_tab_per_group, number_of_haplotypes_per_group, models_dict, ancestral_makeup):
        """ Initialize the attributes of the class. """

        print('Initializing the algorithm for distant admixtures:')

        if not all(len(leg_tab)==len(hap_tab) for hap_tab in hap_tab_per_group.values()):
            raise Exception('Error: the number of SNPs in the LEGEND file differs from the number of SNPs in the HAP file.')

        if sum(ancestral_makeup.values())<0.999:
            raise Exception('Error: ancestry proportions must sum to one.')

        self.models_dict = models_dict
        self.number_of_haplotypes_per_group = number_of_haplotypes_per_group
        self.ancestral_makeup = ancestral_makeup #Keys are group2 names and values are ancestry proportions.

        for group2 in ancestral_makeup:
            if group2 not in hap_tab_per_group:
                raise Exception('Error: %s is missing from the reference panel.' % group2)

        self.hap_dict_per_group = {}
        self.fraction_of_matches = {}

        for group2, hap_tab in hap_tab_per_group.items():
            self.hap_dict_per_group[group2], self.fraction_of_matches[group2] = \
                self.build_hap_dict(obs_tab, leg_tab, hap_tab, number_of_haplotypes_per_group[group2])
            print('%.2f%% of the observed alleles matched the reference panel of %s.' % (100*self.fraction_of_matches[group2], group2))
    def build_hap_dict(self, obs_tab, leg_tab, hap_tab, number_of_haplotypes):
        """ Returns a dictionary that lists SNP alleles and gives their
        relevant row from the haplotypes table. The row is stored as bits, where
        True means that the haplotype contains the allele. We denote the
        returned dictionary as the reference panel. """

        hap_dict = dict()
        mismatches = 0
        combined = {pos: (ref,alt,hap) for (chr_id,pos,ref,alt),hap in zip(leg_tab, hap_tab)}
        missing = 3*(None,)

        b = (1 << number_of_haplotypes) - 1 #### equivalent to int('1'*number_of_haplotypes,2)

        for (pos, read_id, base) in obs_tab:
            ref, alt, hap = combined.get(pos, missing)
            if base==alt:
                hap_dict[(pos,base)] = hap
            elif base==ref:
                hap_dict[(pos,base)] = hap ^ b ### ^b flips all the bits of hap, using the bitwise XOR operator.
            else:
                mismatches += 1

        fraction_of_matches = 1-mismatches/len(obs_tab)

        return hap_dict, fraction_of_matches
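    # Illustrative sketch of the bit encoding (hypothetical values): with four
    # reference haplotypes whose alt-allele column at position 100 is hap=0b0101,
    # an observed alt base is stored as-is, while an observed ref base stores the
    # complement within number_of_haplotypes bits:
    #     hap_dict[(100, 'T')] = 0b0101             # set bits mark haplotypes carrying the alt allele
    #     hap_dict[(100, 'A')] = 0b0101 ^ 0b1111    # = 0b1010, the haplotypes carrying the ref allele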
    def build_intrenal_hap_dict(self, alleles, group2):
        """ This function allows treatment of alleles and haplotypes on an
        equal footing. This is done in three steps: (1) All the alleles and
        haplotypes are enumerated. (2) For each given haplotype, the tuples in
        the reference panel that are associated with the haplotype's alleles
        are intersected. (3) A dictionary that lists all the alleles and
        haplotypes by their index is returned. The dictionary gives for each
        allele and haplotype their associated tuple and intersected tuple,
        respectively. """

        hap_dict = self.hap_dict_per_group[group2]
        internal = {}

        for i, haplotype in enumerate(alleles):
            if type(haplotype[0])==tuple: #Checks if the item is a tuple/list of alleles.
                n = len(haplotype)
                if n==1:
                    internal[1 << i] = hap_dict[haplotype[0]]
                elif n==2:
                    internal[1 << i] = hap_dict[haplotype[0]] & hap_dict[haplotype[1]]
                else:
                    internal[1 << i] = reduce(and_, itemgetter(*haplotype)(hap_dict))
            elif type(haplotype[0])==int: #Checks if the item is a single allele.
                internal[1 << i] = hap_dict[haplotype]
            else:
                raise Exception('error: joint_frequencies only accepts alleles and tuples/lists of alleles.')

        return internal
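    # Argument-form sketch (hypothetical alleles): `alleles` may mix single alleles
    # and haplotypes, e.g. ((100,'T'), ((200,'A'),(300,'G'))). The single allele is
    # mapped to key 1 and takes its reference-panel row directly, while the
    # two-allele haplotype is mapped to key 2 and takes the bitwise AND of the rows
    # of its alleles.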
    def effective_joint_frequency(self, hap):
        """ The probability of drawing the haplotype pattern from one of the
        ancestral groups, weighted by the ancestry proportions. """

        return sum(proportion * popcount(hap[group2]) / self.number_of_haplotypes_per_group[group2]
                       for group2, proportion in self.ancestral_makeup.items())
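    # Worked sketch (hypothetical numbers): with ancestral_makeup = {'AFR': 0.8, 'EUR': 0.2},
    # 100 haplotypes per group, and a pattern carried by 30 AFR and 10 EUR reference
    # haplotypes, the effective frequency is 0.8*30/100 + 0.2*10/100 = 0.26.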
    def joint_frequencies_combo(self, alleles):
        """ Based on the reference panel, it calculates the joint frequencies of
        observed alleles. The function arguments are alleles, that is,
        tuples of position and base, e.g., (100,'T'), (123,'A') and
        (386,'C'). Each allele is enumerated according to the order in which it
        was received by the function. The function returns a dictionary that
        lists all the possible subgroups of the given alleles. Each key in
        the dictionary is an integer that encodes a subgroup as a bitmask,
        where bit i corresponds to the i-th enumerated allele. For each
        subgroup of alleles the dictionary gives the joint frequency in the
        admixed population. The function arguments can also include
        haplotypes, that is, tuples of alleles; haplotypes are treated in the
        same manner as alleles. """

        representations = [1 << i for i in range(len(alleles))]

        intrenal_hap_dict_per_group = {group2: self.build_intrenal_hap_dict(alleles, group2)
                                           for group2 in self.hap_dict_per_group}

        result = {}

        for c in representations:
            hap = {group2: internal[c] for group2, internal in intrenal_hap_dict_per_group.items()}
            result[c] = self.effective_joint_frequency(hap)

        for C in combinations(representations, 2):
            hap = {group2: internal[C[0]] & internal[C[1]] for group2, internal in intrenal_hap_dict_per_group.items()}
            result[C[0]|C[1]] = self.effective_joint_frequency(hap)

        for C in combinations(representations, 3):
            hap = {group2: internal[C[0]] & internal[C[1]] & internal[C[2]]
                       for group2, internal in intrenal_hap_dict_per_group.items()}
            result[C[0]|C[1]|C[2]] = self.effective_joint_frequency(hap)

        for r in range(4, len(alleles)):
            for C in combinations(representations, r):
                hap = {group2: reduce(and_, itemgetter(*C)(internal))
                           for group2, internal in intrenal_hap_dict_per_group.items()}
                result[sum(C)] = self.effective_joint_frequency(hap)

        if len(alleles)>=4:
            hap = {group2: reduce(and_, internal.values())
                       for group2, internal in intrenal_hap_dict_per_group.items()}
            result[sum(representations)] = self.effective_joint_frequency(hap)

        return result
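    # Key-encoding sketch: with three alleles enumerated as bits 1, 2 and 4, the
    # returned dictionary maps every non-empty subset to its effective joint
    # frequency, e.g. result[1] = f(a), result[3] = f(a,b) and result[7] = f(a,b,c).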
    def joint_frequencies_redundant(self, alleles):
        """ Performs the same task as the function joint_frequencies_combo, but
        sacrifices efficiency in favor of simplicity. """

        internal = {}
        for group2, hap_dict in self.hap_dict_per_group.items():
            internal[group2] = {1 << i: reduce(and_, itemgetter(*haplotype)(hap_dict))
                                            if type(haplotype[0])==tuple else hap_dict[haplotype]
                                    for i, haplotype in enumerate(alleles)}

        representations = [1 << i for i in range(len(alleles))]

        result = {}

        for c in representations:
            hap = {group2: internal[group2][c] for group2 in internal}
            result[c] = self.effective_joint_frequency(hap)

        for r in range(1, len(alleles)):
            for C in combinations(representations, r+1):
                hap = {group2: reduce(and_, itemgetter(*C)(internal[group2]))
                           for group2 in internal}
                result[sum(C)] = self.effective_joint_frequency(hap)

        return result
    def likelihoods(self, alleles):
        """ Calculates the likelihood of observing a set of alleles and
        haplotypes under four scenarios, namely, monosomy, disomy, SPH
        and BPH. """

        models = self.models_dict[len(alleles)]
        F = self.joint_frequencies_combo(alleles)

        ### BPH ###
        (((A0, A1),((B0,),)),) = models['BPH'][1].items()

        BPH = (A0 / A1) * F[B0]

        BPH += sum( sum(F[B0] * F[B1] for (B0, B1) in C) * A0 / A1
                       for (A0, A1), C in models['BPH'][2].items())

        if len(alleles)>2:
            BPH += sum( sum(F[B0] * sum(F[B1] * F[B2] for (B1, B2) in C[B0]) for B0 in C) * A0 / A1
                           for (A0, A1), C in models['BPH'][3].items())

        ### SPH ###
        (((A0, A1),((B0,),)),) = models['SPH'][1].items()

        SPH = (A0 / A1) * F[B0]

        SPH += sum( sum(F[B0] * F[B1] for (B0, B1) in C) * A0 / A1
                       for (A0, A1), C in models['SPH'][2].items())

        ### DIPLOIDY ###
        (((A0, A1),((B0,),)),) = models['DISOMY'][1].items()

        DISOMY = (A0 / A1) * F[B0]

        DISOMY += sum( sum(F[B0] * F[B1] for (B0, B1) in C) * A0 / A1
                          for (A0, A1), C in models['DISOMY'][2].items())

        ### MONOSOMY ###
        ((B0,),) = models['MONOSOMY'][1][(1,1)]

        MONOSOMY = F[B0]

        result = likelihoods_tuple(MONOSOMY, DISOMY, SPH, BPH)

        return result
    def likelihoods2(self, alleles):
        """ Calculates the likelihood of observing two alleles/haplotypes
        under four scenarios, namely, monosomy, disomy, SPH and BPH. """

        F = self.joint_frequencies_combo(alleles)
        a, b, ab = F[1], F[2], F[3] #Effective frequencies of the first allele, the second allele and their joint occurrence.

        BPH = (ab+2*a*b)/3 #The likelihood of three unmatched haplotypes.
        SPH = (5*ab+4*a*b)/9 #The likelihood of two identical haplotypes out of three.
        DISOMY = (ab+a*b)/2 #The likelihood of diploidy.
        MONOSOMY = ab #The likelihood of monosomy.

        return likelihoods_tuple(MONOSOMY, DISOMY, SPH, BPH)
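    # Reasoning sketch behind the coefficients: each read is drawn uniformly from the
    # available haplotypes, so both reads come from the same haplotype (joint
    # frequency ab) with probability 1/3 under BPH (three distinct haplotypes),
    # (2/3)**2 + (1/3)**2 = 5/9 under SPH (two identical copies plus one distinct),
    # 1/2 under disomy, and always under monosomy; otherwise the frequencies
    # factorize as a*b.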
    def likelihoods3(self, alleles):
        """ Calculates the likelihood of observing three alleles/haplotypes
        under four scenarios, namely, monosomy, disomy, SPH and BPH. """

        F = self.joint_frequencies_combo(alleles)
        a, b, ab, c, ac, bc, abc = (F[i] for i in range(1,8))

        BPH = (abc+2*(ab*c+ac*b+bc*a+a*b*c))/9 #The likelihood of three unmatched haplotypes.
        SPH = abc/3+2*(ab*c+ac*b+bc*a)/9 #The likelihood of two identical haplotypes out of three.
        DISOMY = (abc+ab*c+ac*b+bc*a)/4 #The likelihood of diploidy.
        MONOSOMY = abc #The likelihood of monosomy.

        return likelihoods_tuple(MONOSOMY, DISOMY, SPH, BPH)
    def likelihoods4(self, alleles):
        """ Calculates the likelihood of observing four alleles/haplotypes
        under four scenarios, namely, monosomy, disomy, SPH and BPH. """

        F = self.joint_frequencies_combo(alleles)
        a, b, ab, c, ac, bc, abc, d, ad, bd, abd, cd, acd, bcd, abcd = (F[i] for i in range(1,16))

        BPH = (abcd+2*(ab*c*d+a*bd*c+a*bc*d+ac*b*d+a*b*cd+ad*b*c+abc*d+a*bcd+acd*b+abd*c+ab*cd+ad*bc+ac*bd))/27 #The likelihood of three unmatched haplotypes.
        SPH = (17*abcd+10*(abc*d+bcd*a+acd*b+abd*c)+8*(ab*cd+ad*bc+ac*bd))/81 #The likelihood of two identical haplotypes out of three.
        DISOMY = (abcd+abc*d+bcd*a+acd*b+abd*c+ab*cd+ad*bc+ac*bd)/8 #The likelihood of diploidy.
        MONOSOMY = abcd #The likelihood of monosomy.

        return likelihoods_tuple(MONOSOMY, DISOMY, SPH, BPH)
    def get_likelihoods(self, alleles):
        """ Uses the optimal function to calculate the likelihoods.
        In general, self.likelihoods can handle any number of alleles, but the
        dedicated functions are optimized for two, three and four alleles. """

        l = len(alleles)
        if l==2:
            result = self.likelihoods2(alleles)
        elif l==3:
            result = self.likelihoods3(alleles)
        elif l==4:
            result = self.likelihoods4(alleles)
        else:
            result = self.likelihoods(alleles)
        return result
def wrapper_of_distant_admixture_for_debugging(obs_filename, leg_filename, hap_filename, models_filename, admixture):
    """ Wrapper function of the class 'distant_admixture'. It receives an
    observations file, a legend file, a haplotypes file, a file with the
    statistical models and an ancestral-makeup dictionary. Based on the given
    data it creates and returns an instance of the class. """

    if not os.path.isfile(obs_filename): raise Exception('Error: OBS file does not exist.')
    if not os.path.isfile(leg_filename): raise Exception('Error: LEGEND file does not exist.')
    if not os.path.isfile(hap_filename): raise Exception('Error: HAP file does not exist.')
    if not os.path.isfile(models_filename): raise Exception('Error: MODELS file does not exist.')

    load = lambda filename: {'bz2': bz2.open, 'gz': gzip.open}.get(filename.rsplit('.',1)[1], open) #Adjusts the opening method according to the file extension.

    open_hap = load(hap_filename)
    with open_hap(hap_filename,'rb') as hap_in:
        hap_tab_per_group = pickle.load(hap_in)
        number_of_haplotypes_per_group = pickle.load(hap_in)

    open_leg = load(leg_filename)
    with open_leg(leg_filename,'rb') as leg_in:
        leg_tab = pickle.load(leg_in)

    open_obs = load(obs_filename)
    with open_obs(obs_filename, 'rb') as obs_in:
        obs_tab = pickle.load(obs_in)
        #info = pickle.load(obs_in)

    open_model = load(models_filename)
    with open_model(models_filename, 'rb') as model_in:
        models_dict = pickle.load(model_in)

    ###return obs_tab, leg_tab, hap_tab_per_group, number_of_haplotypes_per_group, admixture, models_dict
    return distant_admixture(obs_tab, leg_tab, hap_tab_per_group, number_of_haplotypes_per_group, models_dict, admixture)
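# Usage sketch (hypothetical file paths and group names):
#     admixture = {'group0': 0.5, 'group1': 0.5}
#     model = wrapper_of_distant_admixture_for_debugging('chr21.obs.p.bz2', 'chr21.leg.p.bz2',
#                                                        'chr21.hap.p.bz2', 'MODELS/MODELS16.p', admixture)
#     alleles = tuple(model.hap_dict_per_group['group0'])[:4]
#     print(model.get_likelihoods(alleles))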
if platform.python_implementation()=='PyPy':
    popcount = popcount_lk()
else:
    try:
        from gmpy2 import popcount
    except Exception as error_msg:
        print(error_msg)
        popcount = popcount_lk()
if __name__ != "__main__":
    print('The module DISTANT_ADMIXTURE_MODELS was imported.')
else:
    print('The module DISTANT_ADMIXTURE_MODELS was invoked directly')
    sys.exit(0)
############################### END OF FILE ###############################
"""
if __name__ != "__main__":
print("The module DISTANT_ADMIXTURE_MODELS was imported.")
else:
print('The module DISTANT_ADMIXTURE_MODELS was invoked directly')
#sys.exit(0)
import time, random
t0 = time.time()
obs_filename = 'test/test.obs.p'
hap_filename = 'test/test.hap.p'
leg_filename = 'test/test.leg.p'
models_filename = 'MODELS/MODELS16.p'
admixture = {'group0': 0.5, 'group1': 0.5}
A = wrapper_of_distant_admixture_for_debugging(obs_filename,leg_filename,hap_filename,models_filename,admixture)
alleles = tuple(A.hap_dict_per_group['group0'].keys())
frequencies = lambda *x: {bin(a)[2:]:b for a,b in A.joint_frequencies_combo(*x).items()}
frequencies_redundant = lambda *x: {bin(a)[2:]:b for a,b in A.joint_frequencies_redundant(*x).items()}
likelihoods = A.likelihoods
likelihoods2 = A.likelihoods2
likelihoods3 = A.likelihoods3
likelihoods4 = A.likelihoods4
random.seed(a=2022, version=2)
x = random.randrange(len(alleles)-16)
haplotypes = (alleles[x:x+4],alleles[x+4:x+8],alleles[x+8:x+12],alleles[x+12:x+16])
print('-----joint_frequencies_combo-----')
print(frequencies(alleles[x+0:x+1]))
print(frequencies_redundant(alleles[x+0:x+1]))
print(frequencies(alleles[x:x+4]))
print(frequencies_redundant(alleles[x:x+4]))
print('-----likelihoods4-haplotypes-----')
print(haplotypes)
print(frequencies(haplotypes))
print(frequencies_redundant(haplotypes))
print(likelihoods(haplotypes))
print(likelihoods4(haplotypes))
print('-----likelihoods2-----')
print(alleles[x:x+2])
print(frequencies(alleles[x:x+2]))
print(frequencies_redundant(alleles[x:x+2]))
print(likelihoods(alleles[x:x+2]))
print(likelihoods2(alleles[x:x+2]))
print('-----likelihoods3-----')
print(alleles[x:x+3])
print(frequencies(alleles[x:x+3]))
print(frequencies_redundant(alleles[x:x+3]))
print(likelihoods(alleles[x:x+3]))
print(likelihoods3(alleles[x:x+3]))
print('-----likelihoods4-----')
print(alleles[x:x+4])
print(frequencies(alleles[x:x+3]))
print(frequencies_redundant(alleles[x:x+3]))
print(likelihoods(alleles[x:x+4]))
print(likelihoods4(alleles[x:x+4]))
t1 = time.time()
print('Done in %.3f sec.' % ((t1-t0)))
"""