forked from bulik/ldsc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ldsc.py
executable file
·642 lines (567 loc) · 29 KB
/
ldsc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
#!/usr/bin/env python
'''
(c) 2014 Brendan Bulik-Sullivan and Hilary Finucane
LDSC is a command line tool for estimating
1. LD Score
2. heritability / partitioned heritability
3. genetic covariance / correlation
'''
from __future__ import division
import ldscore.ldscore as ld
import ldscore.parse as ps
import ldscore.sumstats as sumstats
import ldscore.regressions as reg
import numpy as np
import pandas as pd
from subprocess import call
from itertools import product
import time, sys, traceback, argparse
try:
x = pd.DataFrame({'A': [1, 2, 3]})
x.drop_duplicates(subset='A')
except TypeError:
raise ImportError('LDSC requires pandas version > 0.15.2')
__version__ = '1.0.0'
MASTHEAD = "*********************************************************************\n"
MASTHEAD += "* LD Score Regression (LDSC)\n"
MASTHEAD += "* Version {V}\n".format(V=__version__)
MASTHEAD += "* (C) 2014-2015 Brendan Bulik-Sullivan and Hilary Finucane\n"
MASTHEAD += "* Broad Institute of MIT and Harvard / MIT Department of Mathematics\n"
MASTHEAD += "* GNU General Public License v3\n"
MASTHEAD += "*********************************************************************\n"
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('precision', 4)
pd.set_option('max_colwidth',1000)
np.set_printoptions(linewidth=1000)
np.set_printoptions(precision=4)
def sec_to_str(t):
'''Convert seconds to days:hours:minutes:seconds'''
[d, h, m, s, n] = reduce(lambda ll, b : divmod(ll[0], b) + ll[1:], [(t, 1), 60, 60, 24])
f = ''
if d > 0:
f += '{D}d:'.format(D=d)
if h > 0:
f += '{H}h:'.format(H=h)
if m > 0:
f += '{M}m:'.format(M=m)
f += '{S}s'.format(S=s)
return f
def _remove_dtype(x):
'''Removes dtype: float64 and dtype: int64 from pandas printouts'''
x = str(x)
x = x.replace('\ndtype: int64', '')
x = x.replace('\ndtype: float64', '')
return x
class Logger(object):
'''
Lightweight logging.
TODO: replace with logging module
'''
def __init__(self, fh):
self.log_fh = open(fh, 'wb')
def log(self, msg):
'''
Print to log file and stdout with a single command.
'''
print >>self.log_fh, msg
print msg
def __filter__(fname, noun, verb, merge_obj):
merged_list = None
if fname:
f = lambda x,n: x.format(noun=noun, verb=verb, fname=fname, num=n)
x = ps.FilterFile(fname)
c = 'Read list of {num} {noun} to {verb} from {fname}'
print f(c, len(x.IDList))
merged_list = merge_obj.loj(x.IDList)
len_merged_list = len(merged_list)
if len_merged_list > 0:
c = 'After merging, {num} {noun} remain'
print f(c, len_merged_list)
else:
error_msg = 'No {noun} retained for analysis'
raise ValueError(f(error_msg, 0))
return merged_list
def annot_sort_key(s):
'''For use with --cts-bin. Fixes weird pandas crosstab column order.'''
if type(s) == tuple:
s = [x.split('_')[0] for x in s]
s = map(lambda x: float(x) if x != 'min' else -float('inf'), s)
else: # type(s) = str:
s = s.split('_')[0]
if s == 'min':
s = float('-inf')
else:
s = float(s)
return s
def ldscore(args, log):
'''
Wrapper function for estimating l1, l1^2, l2 and l4 (+ optionally standard errors) from
reference panel genotypes.
Annot format is
chr snp bp cm <annotations>
'''
if args.bfile:
snp_file, snp_obj = args.bfile+'.bim', ps.PlinkBIMFile
ind_file, ind_obj = args.bfile+'.fam', ps.PlinkFAMFile
array_file, array_obj = args.bfile+'.bed', ld.PlinkBEDFile
# read bim/snp
array_snps = snp_obj(snp_file)
m = len(array_snps.IDList)
log.log('Read list of {m} SNPs from {f}'.format(m=m, f=snp_file))
if args.annot is not None: # read --annot
try:
annot = ps.AnnotFile(args.annot)
n_annot, ma = len(annot.df.columns) - 4, len(annot.df)
log.log("Read {A} annotations for {M} SNPs from {f}".format(f=args.annot,
A=n_annot, M=ma))
annot_matrix = np.array(annot.df.iloc[:,4:])
annot_colnames = annot.df.columns[4:]
keep_snps = None
if np.any(annot.df.SNP.values != array_snps.df.SNP.values):
raise ValueError('The .annot file must contain the same SNPs in the same'+\
' order as the .bim file.')
except Exception:
log.log('Error parsing .annot file')
raise
elif args.extract is not None: # --extract
keep_snps = __filter__(args.extract, 'SNPs', 'include', array_snps)
annot_matrix, annot_colnames, n_annot = None, None, 1
elif args.cts_bin is not None and args.cts_breaks is not None: # --cts-bin
cts_fnames = sumstats._splitp(args.cts_bin) # read filenames
args.cts_breaks = args.cts_breaks.replace('N','-') # replace N with negative sign
try: # split on x
breaks = [[float(x) for x in y.split(',')] for y in args.cts_breaks.split('x')]
except ValueError as e:
raise ValueError('--cts-breaks must be a comma-separated list of numbers: '
+str(e.args))
if len(breaks) != len(cts_fnames):
raise ValueError('Need to specify one set of breaks for each file in --cts-bin.')
if args.cts_names:
cts_colnames = [str(x) for x in args.cts_names.split(',')]
if len(cts_colnames) != len(cts_fnames):
msg = 'Must specify either no --cts-names or one value for each file in --cts-bin.'
raise ValueError(msg)
else:
cts_colnames = ['ANNOT'+str(i) for i in xrange(len(cts_fnames))]
log.log('Reading numbers with which to bin SNPs from {F}'.format(F=args.cts_bin))
cts_levs = []
full_labs = []
for i,fh in enumerate(cts_fnames):
vec = ps.read_cts(cts_fnames[i], array_snps.df.SNP.values)
max_cts = np.max(vec)
min_cts = np.min(vec)
cut_breaks = list(breaks[i])
name_breaks = list(cut_breaks)
if np.all(cut_breaks >= max_cts) or np.all(cut_breaks <= min_cts):
raise ValueError('All breaks lie outside the range of the cts variable.')
if np.all(cut_breaks <= max_cts):
name_breaks.append(max_cts)
cut_breaks.append(max_cts+1)
if np.all(cut_breaks >= min_cts):
name_breaks.append(min_cts)
cut_breaks.append(min_cts-1)
name_breaks.sort()
cut_breaks.sort()
n_breaks = len(cut_breaks)
# so that col names are consistent across chromosomes with different max vals
name_breaks[0] = 'min'
name_breaks[-1] = 'max'
name_breaks = [str(x) for x in name_breaks]
labs = [name_breaks[i]+'_'+name_breaks[i+1] for i in xrange(n_breaks-1)]
cut_vec = pd.Series(pd.cut(vec, bins=cut_breaks, labels=labs))
cts_levs.append(cut_vec)
full_labs.append(labs)
annot_matrix = pd.concat(cts_levs, axis=1)
annot_matrix.columns = cts_colnames
# crosstab -- for now we keep empty columns
annot_matrix = pd.crosstab(annot_matrix.index,
[annot_matrix[i] for i in annot_matrix.columns], dropna=False,
colnames=annot_matrix.columns)
# add missing columns
if len(cts_colnames) > 1:
for x in product(*full_labs):
if x not in annot_matrix.columns:
annot_matrix[x] = 0
else:
for x in full_labs[0]:
if x not in annot_matrix.columns:
annot_matrix[x] = 0
annot_matrix = annot_matrix[sorted(annot_matrix.columns, key=annot_sort_key)]
if len(cts_colnames) > 1:
# flatten multi-index
annot_colnames = ['_'.join([cts_colnames[i]+'_'+b for i,b in enumerate(c)])
for c in annot_matrix.columns]
else:
annot_colnames = [cts_colnames[0]+'_'+b for b in annot_matrix.columns]
annot_matrix = np.matrix(annot_matrix)
keep_snps = None
n_annot = len(annot_colnames)
if np.any(np.sum(annot_matrix, axis=1) == 0):
# This exception should never be raised. For debugging only.
raise ValueError('Some SNPs have no annotation in --cts-bin. This is a bug!')
else:
annot_matrix, annot_colnames, keep_snps = None, None, None,
n_annot = 1
# read fam
array_indivs = ind_obj(ind_file)
n = len(array_indivs.IDList)
log.log('Read list of {n} individuals from {f}'.format(n=n, f=ind_file))
# read keep_indivs
if args.keep:
keep_indivs = __filter__(args.keep, 'individuals', 'include', array_indivs)
else:
keep_indivs = None
# read genotype array
log.log('Reading genotypes from {fname}'.format(fname=array_file))
geno_array = array_obj(array_file, n, array_snps, keep_snps=keep_snps,
keep_indivs=keep_indivs, mafMin=args.maf)
# filter annot_matrix down to only SNPs passing MAF cutoffs
if annot_matrix is not None:
annot_keep = geno_array.kept_snps
annot_matrix = annot_matrix[annot_keep,:]
# determine block widths
x = np.array((args.ld_wind_snps, args.ld_wind_kb, args.ld_wind_cm), dtype=bool)
if np.sum(x) != 1:
raise ValueError('Must specify exactly one --ld-wind option')
if args.ld_wind_snps:
max_dist = args.ld_wind_snps
coords = np.array(xrange(geno_array.m))
elif args.ld_wind_kb:
max_dist = args.ld_wind_kb*1000
coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
elif args.ld_wind_cm:
max_dist = args.ld_wind_cm
coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]
block_left = ld.getBlockLefts(coords, max_dist)
if block_left[len(block_left)-1] == 0 and not args.yes_really:
error_msg = 'Do you really want to compute whole-chomosome LD Score? If so, set the '
error_msg += '--yes-really flag (warning: it will use a lot of time / memory)'
raise ValueError(error_msg)
scale_suffix = ''
if args.pq_exp is not None:
log.log('Computing LD with pq ^ {S}.'.format(S=args.pq_exp))
msg = 'Note that LD Scores with pq raised to a nonzero power are'
msg += 'not directly comparable to normal LD Scores.'
log.log(msg)
scale_suffix = '_S{S}'.format(S=args.pq_exp)
pq = np.matrix(geno_array.maf*(1-geno_array.maf)).reshape((geno_array.m, 1))
pq = np.power(pq, args.pq_exp)
if annot_matrix is not None:
annot_matrix = np.multiply(annot_matrix, pq)
else:
annot_matrix = pq
log.log("Estimating LD Score.")
lN = geno_array.ldScoreVarBlocks(block_left, args.chunk_size, annot=annot_matrix)
col_prefix = "L2"; file_suffix = "l2"
if n_annot == 1:
ldscore_colnames = [col_prefix+scale_suffix]
else:
ldscore_colnames = [y+col_prefix+scale_suffix for y in annot_colnames]
# print .ldscore. Output columns: CHR, BP, RS, [LD Scores]
out_fname = args.out + '.' + file_suffix + '.ldscore'
new_colnames = geno_array.colnames + ldscore_colnames
df = pd.DataFrame.from_records(np.c_[geno_array.df, lN])
df.columns = new_colnames
if args.print_snps:
if args.print_snps.endswith('gz'):
print_snps = pd.read_csv(args.print_snps, header=None, compression='gzip')
elif args.print_snps.endswith('bz2'):
print_snps = pd.read_csv(args.print_snps, header=None, compression='bz2')
else:
print_snps = pd.read_csv(args.print_snps, header=None)
if len(print_snps.columns) > 1:
raise ValueError('--print-snps must refer to a file with a one column of SNP IDs.')
log.log('Reading list of {N} SNPs for which to print LD Scores from {F}'.format(\
F=args.print_snps, N=len(print_snps)))
print_snps.columns=['SNP']
df = df.ix[df.SNP.isin(print_snps.SNP),:]
if len(df) == 0:
raise ValueError('After merging with --print-snps, no SNPs remain.')
else:
msg = 'After merging with --print-snps, LD Scores for {N} SNPs will be printed.'
log.log(msg.format(N=len(df)))
l2_suffix = '.gz'
log.log("Writing LD Scores for {N} SNPs to {f}.gz".format(f=out_fname, N=len(df)))
df.drop(['CM','MAF'], axis=1).to_csv(out_fname, sep="\t", header=True, index=False,
float_format='%.3f')
call(['gzip', '-f', out_fname])
if annot_matrix is not None:
M = np.atleast_1d(np.squeeze(np.asarray(np.sum(annot_matrix, axis=0))))
ii = geno_array.maf > 0.05
M_5_50 = np.atleast_1d(np.squeeze(np.asarray(np.sum(annot_matrix[ii,:], axis=0))))
else:
M = [geno_array.m]
M_5_50 = [np.sum(geno_array.maf > 0.05)]
# print .M
fout_M = open(args.out + '.'+ file_suffix +'.M','wb')
print >>fout_M, '\t'.join(map(str,M))
fout_M.close()
# print .M_5_50
fout_M_5_50 = open(args.out + '.'+ file_suffix +'.M_5_50','wb')
print >>fout_M_5_50, '\t'.join(map(str,M_5_50))
fout_M_5_50.close()
# print annot matrix
if (args.cts_bin is not None) and not args.no_print_annot:
out_fname_annot = args.out + '.annot'
new_colnames = geno_array.colnames + ldscore_colnames
annot_df = pd.DataFrame(np.c_[geno_array.df, annot_matrix])
annot_df.columns = new_colnames
del annot_df['MAF']
log.log("Writing annot matrix produced by --cts-bin to {F}".format(F=out_fname+'.gz'))
annot_df.to_csv(out_fname_annot, sep="\t", header=True, index=False)
call(['gzip', '-f', out_fname_annot])
# print LD Score summary
pd.set_option('display.max_rows', 200)
log.log('\nSummary of LD Scores in {F}'.format(F=out_fname+l2_suffix))
t = df.ix[:,4:].describe()
log.log( t.ix[1:,:] )
np.seterr(divide='ignore', invalid='ignore') # print NaN instead of weird errors
# print correlation matrix including all LD Scores and sample MAF
log.log('')
log.log('MAF/LD Score Correlation Matrix')
log.log( df.ix[:,4:].corr() )
# print condition number
if n_annot > 1: # condition number of a column vector w/ nonzero var is trivially one
log.log('\nLD Score Matrix Condition Number')
cond_num = np.linalg.cond(df.ix[:,5:])
log.log( reg.remove_brackets(str(np.matrix(cond_num))) )
if cond_num > 10000:
log.log('WARNING: ill-conditioned LD Score Matrix!')
# summarize annot matrix if there is one
if annot_matrix is not None:
# covariance matrix
x = pd.DataFrame(annot_matrix, columns=annot_colnames)
log.log('\nAnnotation Correlation Matrix')
log.log( x.corr() )
# column sums
log.log('\nAnnotation Matrix Column Sums')
log.log(_remove_dtype(x.sum(axis=0)))
# row sums
log.log('\nSummary of Annotation Matrix Row Sums')
row_sums = x.sum(axis=1).describe()
log.log(_remove_dtype(row_sums))
np.seterr(divide='raise', invalid='raise')
parser = argparse.ArgumentParser()
parser.add_argument('--out', default='ldsc', type=str,
help='Output filename prefix. If --out is not set, LDSC will use ldsc as the '
'defualt output filename prefix.')
# Basic LD Score Estimation Flags'
parser.add_argument('--bfile', default=None, type=str,
help='Prefix for Plink .bed/.bim/.fam file')
parser.add_argument('--l2', default=False, action='store_true',
help='Estimate l2. Compatible with both jackknife and non-jackknife.')
# Filtering / Data Management for LD Score
parser.add_argument('--extract', default=None, type=str,
help='File with SNPs to include in LD Score estimation. '
'The file should contain one SNP ID per row.')
parser.add_argument('--keep', default=None, type=str,
help='File with individuals to include in LD Score estimation. '
'The file should contain one individual ID per row.')
parser.add_argument('--ld-wind-snps', default=None, type=int,
help='Specify the window size to be used for estimating LD Scores in units of '
'# of SNPs. You can only specify one --ld-wind-* option.')
parser.add_argument('--ld-wind-kb', default=None, type=float,
help='Specify the window size to be used for estimating LD Scores in units of '
'kilobase-pairs (kb). You can only specify one --ld-wind-* option.')
parser.add_argument('--ld-wind-cm', default=None, type=float,
help='Specify the window size to be used for estimating LD Scores in units of '
'centiMorgans (cM). You can only specify one --ld-wind-* option.')
parser.add_argument('--print-snps', default=None, type=str,
help='This flag tells LDSC to only print LD Scores for the SNPs listed '
'(one ID per row) in PRINT_SNPS. The sum r^2 will still include SNPs not in '
'PRINT_SNPs. This is useful for reducing the number of LD Scores that have to be '
'read into memory when estimating h2 or rg.' )
# Fancy LD Score Estimation Flags
parser.add_argument('--annot', default=None, type=str,
help='Filename prefix for annotation file for partitioned LD Score estimation. '
'LDSC will automatically append .annot or .annot.gz to the filename prefix. '
'See docs/file_formats_ld for a definition of the .annot format.')
parser.add_argument('--cts-bin', default=None, type=str,
help='This flag tells LDSC to compute partitioned LD Scores, where the partition '
'is defined by cutting one or several continuous variable[s] into bins. '
'The argument to this flag should be the name of a single file or a comma-separated '
'list of files. The file format is two columns, with SNP IDs in the first column '
'and the continuous variable in the second column. ')
parser.add_argument('--cts-breaks', default=None, type=str,
help='Use this flag to specify names for the continuous variables cut into bins '
'with --cts-bin. For each continuous variable, specify breaks as a comma-separated '
'list of breakpoints, and separate the breakpoints for each variable with an x. '
'For example, if binning on MAF and distance to gene (in kb), '
'you might set --cts-breaks 0.1,0.25,0.4x10,100,1000 ')
parser.add_argument('--cts-names', default=None, type=str,
help='Use this flag to specify names for the continuous variables cut into bins '
'with --cts-bin. The argument to this flag should be a comma-separated list of '
'names. For example, if binning on DAF and distance to gene, you might set '
'--cts-bin DAF,DIST_TO_GENE ')
parser.add_argument('--per-allele', default=False, action='store_true',
help='Setting this flag causes LDSC to compute per-allele LD Scores, '
'i.e., \ell_j := \sum_k p_k(1-p_k)r^2_{jk}, where p_k denotes the MAF '
'of SNP j. ')
parser.add_argument('--pq-exp', default=None, type=float,
help='Setting this flag causes LDSC to compute LD Scores with the given scale factor, '
'i.e., \ell_j := \sum_k (p_k(1-p_k))^a r^2_{jk}, where p_k denotes the MAF '
'of SNP j and a is the argument to --pq-exp. ')
parser.add_argument('--no-print-annot', default=False, action='store_true',
help='By defualt, seting --cts-bin or --cts-bin-add causes LDSC to print '
'the resulting annot matrix. Setting --no-print-annot tells LDSC not '
'to print the annot matrix. ')
parser.add_argument('--maf', default=None, type=float,
help='Minor allele frequency lower bound. Default is MAF > 0.')
# Basic Flags for Working with Variance Components
parser.add_argument('--h2', default=None, type=str,
help='Filename prefix for a .chisq file for one-phenotype LD Score regression. '
'LDSC will automatically append .chisq or .chisq.gz to the filename prefix.'
'--h2 requires at minimum also setting the --ref-ld and --w-ld flags.')
parser.add_argument('--rg', default=None, type=str,
help='Comma-separated list of prefixes of .chisq filed for genetic correlation estimation.')
parser.add_argument('--ref-ld', default=None, type=str,
help='Use --ref-ld to tell LDSC which LD Scores to use as the predictors in the LD '
'Score regression. '
'LDSC will automatically append .l2.ldscore/.l2.ldscore.gz to the filename prefix.')
parser.add_argument('--ref-ld-chr', default=None, type=str,
help='Same as --ref-ld, but will automatically concatenate .l2.ldscore files split '
'across 22 chromosomes. LDSC will automatically append .l2.ldscore/.l2.ldscore.gz '
'to the filename prefix. If the filename prefix contains the symbol @, LDSC will '
'replace the @ symbol with chromosome numbers. Otherwise, LDSC will append chromosome '
'numbers to the end of the filename prefix.'
'Example 1: --ref-ld-chr ld/ will read ld/1.l2.ldscore.gz ... ld/22.l2.ldscore.gz'
'Example 2: --ref-ld-chr ld/@_kg will read ld/1_kg.l2.ldscore.gz ... ld/22_kg.l2.ldscore.gz')
parser.add_argument('--w-ld', default=None, type=str,
help='Filename prefix for file with LD Scores with sum r^2 taken over SNPs included '
'in the regression. LDSC will automatically append .l2.ldscore/.l2.ldscore.gz.')
parser.add_argument('--w-ld-chr', default=None, type=str,
help='Same as --w-ld, but will read files split into 22 chromosomes in the same '
'manner as --ref-ld-chr.')
parser.add_argument('--overlap-annot', default=False, action='store_true',
help='This flag informs LDSC that the partitioned LD Scores were generates using an '
'annot matrix with overlapping categories (i.e., not all row sums equal 1), '
'and prevents LDSC from displaying output that is meaningless with overlapping categories.')
parser.add_argument('--no-intercept', action='store_true',
help = 'If used with --h2, this constrains the LD Score regression intercept to equal '
'1. If used with --rg, this constrains the LD Score regression intercepts for the h2 '
'estimates to be one and the intercept for the genetic covariance estimate to be zero.')
parser.add_argument('--intercept-h2', action='store', default=None,
help = 'Intercepts for constrained-intercept single-trait LD Score regression.')
parser.add_argument('--intercept-gencov', action='store', default=None,
help = 'Intercepts for constrained-intercept cross-trait LD Score regression.'
' Must have same length as --rg. The first entry is ignored.')
parser.add_argument('--M', default=None, type=str,
help='# of SNPs (if you don\'t want to use the .l2.M files that came with your .l2.ldscore.gz files)')
parser.add_argument('--two-step', default=None, type=float,
help='Test statistic bound for use with the two-step estimator. Not compatible with --no-intercept and --constrain-intercept.')
parser.add_argument('--chisq-max', default=None, type=float,
help='Max chi^2.')
# Flags for both LD Score estimation and h2/gencor estimation
parser.add_argument('--print-cov', default=False, action='store_true',
help='For use with --h2/--rg. This flag tells LDSC to print the '
'covaraince matrix of the estimates.')
# frequency (useful for .bin files)
parser.add_argument('--print-delete-vals', default=False, action='store_true',
help='If this flag is set, LDSC will print the block jackknife delete-values ('
'i.e., the regression coefficeints estimated from the data with a block removed). '
'The delete-values are formatted as a matrix with (# of jackknife blocks) rows and '
'(# of LD Scores) columns.')
# Flags you should almost never use
parser.add_argument('--chunk-size', default=50, type=int,
help='Chunk size for LD Score calculation. Use the default.')
parser.add_argument('--pickle', default=False, action='store_true',
help='Store .l2.ldscore files as pickles instead of gzipped tab-delimited text.')
parser.add_argument('--yes-really', default=False, action='store_true',
help='Yes, I really want to compute whole-chromosome LD Score.')
parser.add_argument('--invert-anyway', default=False, action='store_true',
help="Force LDSC to attempt to invert ill-conditioned matrices.")
parser.add_argument('--n-blocks', default=200, type=int,
help='Number of block jackknife blocks.')
parser.add_argument('--not-M-5-50', default=False, action='store_true',
help='This flag tells LDSC to use the .l2.M file instead of the .l2.M_5_50 file.')
parser.add_argument('--return-silly-things', default=False, action='store_true',
help='Force ldsc to return silly genetic correlation estimates.')
parser.add_argument('--no-check-alleles', default=False, action='store_true',
help='For rg estimation, skip checking whether the alleles match. This check is '
'redundant for pairs of chisq files generated using munge_sumstats.py and the '
'same argument to the --merge-alleles flag.')
parser.add_argument('--print-coefficients',default=False,action='store_true',
help='when categories are overlapping, print coefficients as well as heritabilities.')
parser.add_argument('--frqfile', type=str,
help='For use with --overlap-annot. Provides allele frequencies to prune to common '
'snps if --not-M-5-50 is not set.')
parser.add_argument('--frqfile-chr', type=str,
help='Prefix for --frqfile files split over chromosome.')
# transform to liability scale
parser.add_argument('--samp-prev',default=None,
help='Sample prevalence of binary phenotype (for conversion to liability scale).')
parser.add_argument('--pop-prev',default=None,
help='Population prevalence of binary phenotype (for conversion to liability scale).')
if __name__ == '__main__':
args = parser.parse_args()
if args.out is None:
raise ValueError('--out is required.')
log = Logger(args.out+'.log')
try:
defaults = vars(parser.parse_args(''))
opts = vars(args)
non_defaults = [x for x in opts.keys() if opts[x] != defaults[x]]
header = MASTHEAD
header += "Call: \n"
header += './ldsc.py \\\n'
options = ['--'+x.replace('_','-')+' '+str(opts[x])+' \\' for x in non_defaults]
header += '\n'.join(options).replace('True','').replace('False','')
header = header[0:-1]+'\n'
log.log(header)
log.log('Beginning analysis at {T}'.format(T=time.ctime()))
start_time = time.time()
if args.n_blocks <= 1:
raise ValueError('--n-blocks must be an integer > 1.')
if args.bfile is not None:
if args.l2 is None:
raise ValueError('Must specify --l2 with --bfile.')
if args.annot is not None and args.extract is not None:
raise ValueError('--annot and --extract are currently incompatible.')
if args.cts_bin is not None and args.extract is not None:
raise ValueError('--cts-bin and --extract are currently incompatible.')
if args.annot is not None and args.cts_bin is not None:
raise ValueError('--annot and --cts-bin are currently incompatible.')
if (args.cts_bin is not None) != (args.cts_breaks is not None):
raise ValueError('Must set both or neither of --cts-bin and --cts-breaks.')
if args.per_allele and args.pq_exp is not None:
raise ValueError('Cannot set both --per-allele and --pq-exp (--per-allele is equivalent to --pq-exp 1).')
if args.per_allele:
args.pq_exp = 1
ldscore(args, log)
# summary statistics
elif (args.h2 or args.rg) and (args.ref_ld or args.ref_ld_chr) and (args.w_ld or args.w_ld_chr):
if args.h2 is not None and args.rg is not None:
raise ValueError('Cannot set both --h2 and --rg.')
if args.ref_ld and args.ref_ld_chr:
raise ValueError('Cannot set both --ref-ld and --ref-ld-chr.')
if args.w_ld and args.w_ld_chr:
raise ValueError('Cannot set both --w-ld and --w-ld-chr.')
if (args.samp_prev is not None) != (args.pop_prev is not None):
raise ValueError('Must set both or neither of --samp-prev and --pop-prev.')
if not args.overlap_annot or args.not_M_5_50:
if args.frqfile is not None or args.frqfile_chr is not None:
log.log('The frequency file is unnecessary and is being ignored.')
args.frqfile = None
args.frqfile_chr = None
if args.overlap_annot and not args.not_M_5_50:
if not ((args.frqfile and args.ref_ld) or (args.frqfile_chr and args.ref_ld_chr)):
raise ValueError ('Must set either --frqfile and --ref-ld or --frqfile-chr and --ref-ld-chr')
if args.rg:
sumstats.estimate_rg(args, log)
elif args.h2:
sumstats.estimate_h2(args, log)
# bad flags
else:
print header
print 'Error: no analysis selected.'
print 'ldsc.py -h describes options.'
except Exception:
ex_type, ex, tb = sys.exc_info()
log.log( traceback.format_exc(ex) )
raise
finally:
log.log('Analysis finished at {T}'.format(T=time.ctime()) )
time_elapsed = round(time.time()-start_time,2)
log.log('Total time elapsed: {T}'.format(T=sec_to_str(time_elapsed)))