forked from kircherlab/CADD-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
114 lines (105 loc) · 3.44 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""Note that we are declaring many files temporary here that should be located in a temporary folder to begin with"""
rule decompress:
input: '{file}.vcf.gz'
output: temp('{file}.vcf')
shell:
'''
zcat {input} > {output}
'''
rule prepare:
input: '{file}.vcf'
output: temp('{file}.prepared.vcf')
conda: 'envs/environment.yml'
shell:
'''
cat {input} \
| python $CADD/src/scripts/VCF2vepVCF.py \
| sort -k1,1 -k2,2n -k3,3 -k4,4 \
| uniq > {output}
'''
rule prescore:
input: '{file}.prepared.vcf'
output:
novel=temp('{file}.novel.vcf'),
prescored=temp('{file}.pre.tsv')
conda: 'envs/environment.yml'
shell:
'''
# Prescoring
echo '## Prescored variant file' > {output.prescored};
if [ -d $CADD/{config[PrescoredFolder]} ]
then
for PRESCORED in $(ls $CADD/{config[PrescoredFolder]}/*.tsv.gz)
do
cat {input} \
| python $CADD/src/scripts/extract_scored.py --header \
-p $PRESCORED --found_out={output.prescored}.tmp \
> {input}.tmp;
cat {output.prescored}.tmp >> {output.prescored}
mv {input}.tmp {input};
done;
rm {output.prescored}.tmp
fi
mv {input} {output.novel}
'''
rule annotation:
input: '{file}.novel.vcf'
output: temp('{file}.anno.tsv.gz')
conda: 'envs/environment.yml'
shell:
'''
cat {input} \
| vep --quiet --cache --offline --dir $CADD/{config[VEPpath]} \
--buffer 1000 --no_stats --species homo_sapiens \
--db_version={config[EnsemblDB]} --assembly {config[GenomeBuild]} \
--format vcf --regulatory --sift b --polyphen b --per_gene --ccds --domains \
--numbers --canonical --total_length --vcf --force_overwrite --output_file STDOUT \
| python $CADD/src/scripts/annotateVEPvcf.py \
-c $CADD/{config[ReferenceConfig]} \
| gzip -c > {output}
'''
rule imputation:
input: '{file}.anno.tsv.gz'
output: temp('{file}.csv.gz')
conda: 'envs/environment.yml'
shell:
'''
zcat {input} \
| python $CADD/src/scripts/trackTransformation.py -b \
-c $CADD/{config[ImputeConfig]} -o {output} --noheader;
'''
rule score:
input:
impute='{file}.csv.gz',
anno='{file}.anno.tsv.gz'
output: temp('{file}.novel.tsv')
conda: 'envs/environment.yml'
shell:
'''
python $CADD/src/scripts/predictSKmodel.py \
-i {input.impute} -m $CADD/{config[Model]} -a {input.anno} \
| python $CADD/src/scripts/max_line_hierarchy.py --all \
| python $CADD/src/scripts/appendPHREDscore.py \
-t $CADD/{config[ConversionTable]} > {output};
if [ "{config[Annotation]}" = 'False' ]
then
cat {output} | cut -f {config[Columns]} | uniq > {output}.tmp
mv {output}.tmp {output}
fi
'''
rule join:
input:
pre='{file}.pre.tsv',
novel='{file}.novel.tsv'
output: '{file}.tsv.gz'
conda: 'envs/environment.yml'
shell:
'''
(
echo "{config[Header]}";
head -n 1 {input.novel};
cat {input.pre} {input.novel} \
| grep -v "^#" \
| sort -k1,1 -k2,2n -k3,3 -k4,4
) | bgzip -c > {output};
'''