"""
Snakefile for genotyping somatic variants for the single-cell fibroblast project (Raghd Rostom)
Author: Davis McCarthy
Affiliation: EMBL-EBI
Run: snakemake -s Snakefile_genotype_sites --jobs 400 --latency-wait 30 --cluster-config ../../cluster.json --cluster 'bsub -J {cluster.name} -q {cluster.queue} -n {cluster.n} -R "select[singularity] rusage[mem={cluster.memory}]" -M {cluster.memory} -o {cluster.output} -e {cluster.error}'
Davis McCarthy, 03 November 2017
"""
import glob
import os
from subprocess import run
import subprocess
import pandas as pd
import re
import h5py
shell.prefix("set -euo pipefail;")
## reference files
human_gx_fasta = 'references/GRCh37.p13.genome.ERCC92.fa'
fasta_unzipped = human_gx_fasta
fasta_dict = fasta_unzipped.replace('fa', 'dict')
fasta_idx = fasta_unzipped + '.fai'
somatic_vars_file = 'data/exome-point-mutations/high-vs-low-exomes.v62.regions_to_call.tsv'
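## The .fai index and .dict dictionary derived above are expected to exist alongside the FASTA;
## no rule in this Snakefile creates them. If needed, they can be generated with standard tools,
## e.g. (commands shown for reference only):
##   samtools faidx references/GRCh37.p13.genome.ERCC92.fa
##   picard CreateSequenceDictionary R=references/GRCh37.p13.genome.ERCC92.fa O=references/GRCh37.p13.genome.ERCC92.dict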
## parameter objects and samples
donors_short_id = donors_lenient_all = ['euts', 'fawm', 'feec', 'fikt', \
    'garx', 'gesg', 'heja', 'hipn', 'ieki', 'joxm', 'kuco', 'laey', 'lexy', 'melw', \
    'naju', 'nusw', 'oaaz', 'oilg', 'pipw', 'puie', 'qayj', 'qolg', 'qonc', 'rozh', \
    'sehl', 'sohd', 'ualf', 'vass', 'vils', 'vuna', 'wahn', 'wetu', 'xugn', 'zoxy']
donor_cell_map_dict = {}
donor_run_map_dict = {}
for donor in donors_short_id:
    with open(os.path.join('data/donor-cell-lists', donor + '.qc-pass.cells.txt')) as f:
        tmp = f.readlines()
    donor_cell_map_dict[donor] = [line.strip() for line in tmp]
    donor_run_map_dict[donor] = [re.sub('#.*', '', line) for line in donor_cell_map_dict[donor]]
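## Each .qc-pass.cells.txt file is assumed to hold one cell identifier per line in
## '<run>#<cell>' form; the re.sub above strips everything from '#' onwards to recover the
## sequencing run id. Illustrative example lines (not real cell ids):
##   22608_2#103
##   22608_3#54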
## read in crams from SS2 run
run_dirs = glob.glob('data/raw/22*')
crams_all = glob.glob('data/raw/22*/cram/*.cram')
fastq_all = glob.glob('data/raw/22*/fastq/*_1.fastq')
## define sample names
RUN_SAMPLES = {}
for run in run_dirs:
    # crams = glob.glob(os.path.join(run, 'cram/*.cram'))
    fastqs = glob.glob(os.path.join(run, 'fastq/*_1.fastq'))
    # RUN_SAMPLES[run] = [os.path.basename(w).replace('.cram', '') for w in crams]
    RUN_SAMPLES[run] = [os.path.basename(w).replace('_1.fastq', '') for w in fastqs]
# SAMPLES = [os.path.basename(w).replace('.cram', '') for w in crams_all]
SAMPLES = [os.path.basename(w).replace('_1.fastq', '') for w in fastq_all]
donor_star_bams = {}
donor_vcf_files_mpileup = []
donor_bams = []
for donor in donors_short_id:
    donor_star_bams[donor] = ['data/raw/{}/star/{}/{}.2pass.Aligned.sortedByCoord.split.realigned.bqsr.bam'.format(i[0], i[1], i[2]) for i in zip(donor_run_map_dict[donor], donor_cell_map_dict[donor], donor_cell_map_dict[donor])]
    donor_vcf_files_mpileup.append('data/raw/mpileup/{}.mpileup.vcf.gz'.format(donor))
    donor_vcf_files_mpileup.append('data/raw/mpileup/{}.mpileup.vcf.gz.csi'.format(donor))
    donor_bams.append('data/raw/bams_cells_by_donor/{}.qc_cells.bam'.format(donor))
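## For illustration, a cell listed as '22608_2#103' (hypothetical id) for donor 'joxm' expands
## to the BAM path:
##   data/raw/22608_2/star/22608_2#103/22608_2#103.2pass.Aligned.sortedByCoord.split.realigned.bqsr.bam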

rule all:
    input:
        donor_vcf_files_mpileup,
        donor_bams

rule make_mpileup_input_list:
    input:
        bams=lambda wildcards: donor_star_bams[wildcards.donor]
    output:
        txt='data/raw/mpileup/{donor}.bam.filenames.txt'
    conda:
        "envs/myenv.yaml"
    singularity:
        "docker://davismcc/fibroblast-clonality"
    run:
        with open(output.txt, 'w') as f:
            for item in input.bams:
                f.write('{}\n'.format(item))

rule call_variants_mpileup_fthicov:
    input:
        fasta=fasta_unzipped,
        fai=fasta_idx,
        fa_dict=fasta_dict,
        bams='data/raw/mpileup/{donor}.bam.filenames.txt',
        snps=somatic_vars_file
    output:
        mpi=temp('data/raw/mpileup/{donor}.filtered.bcf.gz'),
        midx=temp('data/raw/mpileup/{donor}.filtered.bcf.gz.csi'),
        vcf='data/raw/mpileup/{donor}.mpileup.vcf.gz',
        csi='data/raw/mpileup/{donor}.mpileup.vcf.gz.csi'
    conda:
        "envs/myenv.yaml"
    singularity:
        "docker://davismcc/fibroblast-clonality"
    shell:
        """
        # -p so that re-running the rule does not fail if the temp directory already exists
        mkdir -p tmp/tmp-{wildcards.donor}
        # pile up reads from all of this donor's cells at the pre-defined somatic sites only
        bcftools mpileup -E -Ob --skip-indels -R {input.snps} \
            -f {input.fasta} --annotate AD,DP,SP,INFO/AD -o {output.mpi} -b {input.bams}
        bcftools index {output.mpi}
        # call genotypes at those sites, keep records with DP>3 and QUAL>20, then sort and index
        bcftools call -R {input.snps} -m -Ou {output.mpi} | \
            bcftools filter -Ou -i'DP>3 && QUAL>20' | \
            bcftools sort -T tmp/tmp-{wildcards.donor} --max-mem 2G -Oz -o {output.vcf}
        bcftools index {output.vcf}
        """

rule merge_bams_by_donor:
    input:
        fasta=fasta_unzipped,
        fai=fasta_idx,
        fa_dict=fasta_dict,
        bams='data/raw/mpileup/{donor}.bam.filenames.txt'
    output:
        bam='data/raw/bams_cells_by_donor/{donor}.qc_cells.bam',
        bai='data/raw/bams_cells_by_donor/{donor}.qc_cells.bam.bai'
    conda:
        "envs/myenv.yaml"
    singularity:
        "docker://davismcc/fibroblast-clonality"
    shell:
        """
        # -r: attach a read-group tag to each alignment derived from its input (cell) file name
        samtools merge -r -b {input.bams} {output.bam}
        samtools index {output.bam} {output.bai}
        """