-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfilter_and_liftover.submit.sh
executable file
·135 lines (116 loc) · 5.27 KB
/
filter_and_liftover.submit.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/bin/bash
#PBS -N filter_and_liftover
#PBS -l nodes=1:ppn=1
#PBS -l mem=8gb
#PBS -q analysis
#PBS -d /discreps
#PBS -e /discreps/log/filter.err
#PBS -o /discreps/log/filter.out
#PBS -t 1-300
# This script is used to submit jobs for variant filtering and variant lift-over
# The project-level .bcf files are divided into 300 chunks to enable parallele computing
# The code has been refactored. Please submit an issue or contact the author if there is problem or question
baseDir=/discreps
max_allele_INDEL=3
max_allele_SNV=4
total_sample=1572
version=$1 # specify version of the reference [hg19|hg38]
assignMissing=/utils/assignMissing.py
if [[ "$version" == "hg19" ]]; then
reference=/reference/hs37d5/hs37d5.fa # NOT included in github submission
reference_other=/reference/GRCh38_1000Genomes/reference.fa # NOT included in github submission
chain_file=/reference/hg37ToHg38.over.chain # downloaded from UCSC genome browser
exon_capture=/reference/HG19_vcrome2.1_with_PKv2.100bp.bed.gz
chunkfile=/reference/reference_hg19.auto.chunks
elif [[ "$version" == "hg38" ]]; then
reference=/reference/GRCh38_1000Genomes/reference.fa # NOT included in github submission
reference_other=/reference/hs37d5/hs37d5.fa # NOT included in github submission
chain_file=/reference/hg38ToHg37.over.chain # downloaded from UCSC genome browser
exon_capture=/reference/HG38_vcrome2.1_with_PKv2.100bp.bed.gz
chunkfile=/reference_hg38.auto.chunks
else
echo "Specify either hg19 or hg38 as version"; exit 1
fi
# read chromosome position of each chunk
chunk=$PBS_ARRAYID
line=`sed -n "${PBS_ARRAYID}p" $chunkfile`
line=($line)
chr=${line[0]}
start=${line[1]}
end=${line[2]}
# choose filtering criteria
dps=(15) # minimum depth
ratios=(0.25) # minimum minor allelic ratio
genos=(0.85) # minimum genotyping rate across all samples
# We also tried other different combinations of QC criteria
# dps=(5 10 15 20 25)
# ratios=(0.15 0.2 0.25 0.3 0.35)
# genos=(0.75 0.8 0.85 0.9 0.95)
for type in SNV INDEL; do
prefix=$version.$type
input=/inputs/$prefix.bcf # obtain from dbGaP
workingDir=/by_chunk/$chunk
chr_vcf=$workingDir/$prefix.vcf.gz # vcf file separated by each chr chunk
mkdir -p $workingDir/ID $workingDir/assignMissing
cd $workingDir
echo -e "chunk: $chunk, $chr:$start-$end" > chunk
# different filtering criteria for SNV and INDEL for multi-allelic variants
if [[ $type == "SNV" ]]; then
max_allele=$max_allele_SNV
else
max_allele=$max_allele_INDEL
fi
# 1. separate the file into chunk
bcftools view -r $chr:$start-$end -M $max_allele -f '.' $input \
| awk -v start=$start -v end=$end '$1~/^#/ || ($2>=start && $2<=end)' \
| bgzip -c > $chr_vcf
varCount=`bcftools stats $chr_vcf | grep "number of records:" | awk -v FS="\t" '{print $4}'`
echo "varCount: $varCount" >> chunk
# determine whether there is variant within this chunk
if [[ $varCount -eq 0 ]]; then touch warning.noVariants; fi
if [[ ! -f warning.noVariants ]]; then
tabix -p vcf $chr_vcf
bcftools view -R $exon_capture $chr_vcf \
| bgzip -c > $chr_vcf.2
mv $chr_vcf.2 $chr_vcf
rm $chr_vcf.tbi
varCount_exon=`bcftools stats $chr_vcf | grep "number of records:" | awk -v FS="\t" '{print $4}'`
echo "varCount_exon: $varCount_exon" >> chunk
# determine whether there is variant in exon capture regions within this chunk
if [[ $varCount_exon -eq 0 ]]; then touch warning.noExonVariants; fi
if [[ ! -f warning.noExonVariants ]]; then
tabix -p vcf $chr_vcf
fi
fi
if [[ ! -f warning.noVariants ]] && [[ ! -f warning.noExonVariants ]]; then
# 2. liftover
java -Xmx7500m -Xms7500m \
-jar picard.jar LiftoverVcf \
I=$chr_vcf \
O=$prefix.lifted.vcf \
CHAIN=$chain_file \
REJECT=$prefix.lifted.reject.vcf \
RECOVER_SWAPPED_REF_ALT=true \
TAGS_TO_DROP=AC \
R=$reference_other
bcftools plugin fill-tags $prefix.lifted.vcf -- -t AC \
| vcf-sort -c | bgzip -c > $prefix.lifted.vcf.gz
tabix -p vcf $prefix.lifted.vcf.gz
rm $prefix.lifted.vcf $prefix.lifted.vcf.idx
rejectCount=`bcftools stats $prefix.lifted.reject.vcf | grep "number of records:" | awk -v FS="\t" '{print $4}'`
if [[ $rejectCount -eq 0 ]]; then rm $prefix.lifted.reject.vcf; fi
# 3. assign missing variants based on chosen criteria
# different QC criteria were tested, but only use was finally chosen and used in this study
for depth_thres in ${dps[@]}; do for allelic_ratio_thres in ${ratios[@]}; do for genotypingRateThres in ${genos[@]}; do
total_AN=`echo "$total_sample * 2 * $genotypingRateThres" | bc -l`
python $assignMissing -i $chr_vcf -d $depth_thres -r $allelic_ratio_thres \
| bcftools plugin fill-tags -- -t AN,AF,AC \
| bcftools norm -c wx -f $reference -m- -Ou \
| bcftools norm -d none -Ou \
| bcftools view -i "AN>$total_AN & AC>0" -Ou \
| bcftools annotate --set-id '%CHROM:%POS:%REF:%ALT' \
| bgzip -c > $prefix.filtered.vcf.gz
tabix -p vcf $prefix.filtered.vcf.gz
done; done; done
fi
done