-
Notifications
You must be signed in to change notification settings - Fork 2
/
plot_snpHap.py
129 lines (111 loc) · 4.24 KB
/
plot_snpHap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 4 10:13:00 2017
@author: Caiyd
"""
import os
import click
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
def load_bcf2df(bcffile, region, regions_file):
snp_list = []
index_list = []
sw_dict = {r'1|1': 2,
r'1|0': 1,
r'0|1': 1,
r'0|0': 0,
r'1/1': 2,
r'0/1': 1,
r'0/0': 0,
r'./.': np.nan}
if regions_file:
cmd = f'''bcftools view -i 'TYPE=="snp" & N_ALT==1' -R {regions_file} {bcffile}'''
elif region:
cmd = f'''bcftools view -i 'TYPE=="snp" & N_ALT==1' {bcffile} {region}'''
else:
cmd = f'''bcftools view -i 'TYPE=="snp" & N_ALT==1' {bcffile}'''
for line in os.popen(cmd):
if line[0] != '#':
line = line.strip().split()
snp_list.append([sw_dict[x.split(':')[0]] for x in line[9:]])
index_list.append('%s:%s' % (line[0], line[1]))
elif line[:6] == '#CHROM':
name_list = line.strip().split('\t')[9:]
return pd.DataFrame(snp_list, columns=name_list, index=index_list)
def load_sample_order(order_file):
sample_order_list = []
with open(order_file) as f:
for line in f:
if line[0] != '#':
line = line.strip()
sample_order_list.append(line)
return sample_order_list
def filter_af(df, afcutoff):
"""
allele freq
过滤掉ALT频率低于cutoff的位点
"""
print(f'filter alt allele frequence, cutoff is {afcutoff}')
print(f'before: {df.shape}')
afarray = np.nansum(df.values, axis=1) / (np.sum(~np.isnan(df.values), axis=1) * 2)
newdf = df.loc[afarray>=afcutoff, :]
print(f'after: {newdf.shape}')
return newdf
def filter_maf(df, afcutoff):
"""
allele freq
过滤掉minor allele frequency频率低于cutoff的位点
"""
print(f'filter minor allele frequence, cutoff is {afcutoff}')
print(f'before: {df.shape}')
afarray = np.nansum(df.values, axis=1) / (np.sum(~np.isnan(df.values), axis=1) * 2)
afarray[afarray>0.5] = 1 - afarray[afarray>0.5]
print(f'max MAF: {max(afarray)}')
print(f'min MAF: {min(afarray)}')
newdf = df.loc[afarray>=afcutoff, :]
print(f'after: {newdf.shape}')
return newdf
def plot(df, font_scale, outfile):
fig, ax = plt.subplots(1, 1, sharex=True, figsize=(30, 10))
ax.set_facecolor('grey')
# sns.set(font_scale=font_scale)
sns.heatmap(df.T,
cmap='Oranges',
square=False,
cbar=False,
linewidths=0,
linecolor='k',
ax=ax)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
label.set_fontsize(font_scale)
plt.savefig(outfile, dpi=500)
@click.command()
@click.option('--bcffile', help='输入的bcf文件')
@click.option('--region', help='要画的区域,如12:1000-2000', default=None)
@click.option('--regions-file', help='区域文件,默认None,覆盖--region', default=None)
@click.option('--orderfile', help='个体ID顺序,画出的图会按这个排序,一行一个个体ID')
@click.option('--afcutoff', type=float, help='alt的频率,低于这个频率的snp不画', default=None)
@click.option('--mafcutoff', type=float, help='minor allele的频率,低于这个频率的snp不画', default=None)
@click.option('--font-scale', help='坐标轴文字大小尺度, 默认是1', default=1, type=float)
@click.option('--outfile', help='输出文件')
def main(bcffile, region, regions_file, orderfile, afcutoff, mafcutoff, font_scale, outfile):
"""
从bcf文件中画snp的单倍型热图
"""
print('---plot_snpHap---')
df = load_bcf2df(bcffile, region, regions_file)
print('load snp done.')
sample_order_list = load_sample_order(orderfile)
df = df[sample_order_list]
print('sort order done.')
if afcutoff:
df = filter_af(df, afcutoff)
if mafcutoff:
df = filter_maf(df, mafcutoff)
plot(df, font_scale, outfile)
if __name__ == '__main__':
main()