Skip to content

Commit

Permalink
add coverage analysis step
Browse files Browse the repository at this point in the history
  • Loading branch information
priesgo committed Nov 21, 2021
1 parent a44185b commit 233ab0b
Show file tree
Hide file tree
Showing 15 changed files with 111 additions and 79 deletions.
4 changes: 3 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ nextflow.enable.dsl = 2

include { PREPARE_BAM; INDEX_BAM } from './modules/01_prepare_bam'
include { MARK_DUPLICATES } from './modules/02_mark_duplicates'
include { METRICS; HS_METRICS } from './modules/03_metrics'
include { METRICS; HS_METRICS; COVERAGE_ANALYSIS } from './modules/03_metrics'
include { REALIGNMENT_AROUND_INDELS } from './modules/04_realignment_around_indels'
include { BQSR; CREATE_OUTPUT } from './modules/05_bqsr'

Expand All @@ -17,6 +17,7 @@ params.dbsnp = false
params.known_indels1 = false
params.known_indels2 = false
params.intervals = false
params.intervals_bed = false
params.skip_bqsr = false
params.skip_realignment = false
params.skip_deduplication = false
Expand Down Expand Up @@ -100,6 +101,7 @@ workflow {
HS_METRICS(deduplicated_bams)
}
METRICS(deduplicated_bams)
COVERAGE_ANALYSIS(deduplicated_bams)
}

if (!params.skip_realignment) {
Expand Down
9 changes: 4 additions & 5 deletions modules/01_prepare_bam.nf
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ process PREPARE_BAM {
tuple val(name), val(type), file(bam)

output:
tuple val(name), val("${bam.baseName}"), val(type), file("${bam.baseName}.prepared.bam"), emit: prepared_bams
tuple val(name), val(type), file("${name}.prepared.bam"), emit: prepared_bams

script:
order = params.skip_deduplication ? "--SORT_ORDER coordinate": "--SORT_ORDER queryname"
Expand All @@ -43,7 +43,7 @@ process PREPARE_BAM {
--java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=tmp' \
--VALIDATION_STRINGENCY SILENT \
--INPUT /dev/stdin \
--OUTPUT ${bam.baseName}.prepared.bam \
--OUTPUT ${name}.prepared.bam \
--REFERENCE_SEQUENCE ${params.reference} \
--RGPU 1 \
--RGID 1 \
Expand All @@ -62,11 +62,10 @@ process INDEX_BAM {
conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0" : null)

input:
tuple val(name), val(bam_name), val(type), file(bam)
tuple val(name), val(type), file(bam)

output:
tuple val(name), val(bam_name), val(type),
file("${bam}"), file("${bam.baseName}.bai"), emit: indexed_bams
tuple val(name), val(type), file("${bam}"), file("${bam.baseName}.bai"), emit: indexed_bams

script:
"""
Expand Down
13 changes: 6 additions & 7 deletions modules/02_mark_duplicates.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,28 +9,27 @@ process MARK_DUPLICATES {
cpus "${params.mark_duplicates_cpus}"
memory "${params.mark_duplicates_memory}"
tag "${name}"
publishDir "${params.output}/${name}/metrics", mode: "copy", pattern: "*.dedup_metrics.txt"
publishDir "${params.output}/${name}/metrics/mark_duplicates", mode: "copy", pattern: "*.dedup_metrics.txt"

conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0" : null)

input:
tuple val(name), val(bam_name), val(type), file(bam)
tuple val(name), val(type), file(bam)

output:
tuple val(name), val(bam_name), val(type),
file("${bam.baseName}.dedup.bam"), file("${bam.baseName}.dedup.bam.bai"), emit: deduplicated_bams
file("${bam.baseName}.dedup_metrics.txt") optional true
tuple val(name), val(type), file("${name}.dedup.bam"), file("${name}.dedup.bam.bai"), emit: deduplicated_bams
file("${name}.dedup_metrics.txt") optional true

script:
dedup_metrics = params.skip_metrics ? "": "--metrics-file ${bam.baseName}.dedup_metrics.txt"
dedup_metrics = params.skip_metrics ? "": "--metrics-file ${name}.dedup_metrics.txt"
remove_duplicates = params.remove_duplicates ? "--remove-all-duplicates true" : "--remove-all-duplicates false"
"""
mkdir tmp
gatk MarkDuplicatesSpark \
--java-options '-Xmx${params.mark_duplicates_memory} -Djava.io.tmpdir=tmp' \
--input ${bam} \
--output ${bam.baseName}.dedup.bam \
--output ${name}.dedup.bam \
--conf 'spark.executor.cores=${task.cpus}' ${remove_duplicates} ${dedup_metrics}
"""
}
53 changes: 41 additions & 12 deletions modules/03_metrics.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,25 @@ params.collect_hs_metrics_min_base_quality = false
params.collect_hs_metrics_min_mapping_quality = false
params.reference = false
params.output = 'output'
params.intervals_bed = false
params.intervals = false


process HS_METRICS {
cpus "${params.metrics_cpus}"
memory "${params.metrics_memory}"
cpus params.metrics_cpus
memory params.metrics_memory
tag "${name}"
publishDir "${params.output}/${name}/metrics", mode: "copy"
publishDir "${params.output}/${name}/metrics/hs_metrics", mode: "copy"

conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0" : null)

input:
tuple val(name), val(bam_name), val(type), file(bam), file(bai)
tuple val(name), val(type), file(bam), file(bai)

output:
file("*_metrics") optional true
file("*.pdf") optional true
file("${bam.baseName}.hs_metrics.txt")
file("${name}.hs_metrics.txt")

script:
minimum_base_quality = params.collect_hs_metrics_min_base_quality ?
Expand All @@ -33,23 +35,23 @@ process HS_METRICS {
gatk CollectHsMetrics \
--java-options '-Xmx${params.metrics_memory} -Djava.io.tmpdir=tmp' \
--INPUT ${bam} \
--OUTPUT ${bam.baseName}.hs_metrics.txt \
--OUTPUT ${name}.hs_metrics.txt \
--TARGET_INTERVALS ${params.intervals} \
--BAIT_INTERVALS ${params.intervals} \
${minimum_base_quality} ${minimum_mapping_quality}
"""
}

process METRICS {
cpus "${params.metrics_cpus}"
memory "${params.metrics_memory}"
cpus params.metrics_cpus
memory params.metrics_memory
tag "${name}"
publishDir "${params.output}/${name}/metrics", mode: "copy"
publishDir "${params.output}/${name}/metrics/gatk_multiple_metrics", mode: "copy"

conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0" : null)

input:
tuple val(name), val(bam_name), val(type), file(bam), file(bai)
tuple val(name), val(type), file(bam), file(bai)

output:
file("*_metrics") optional true
Expand All @@ -61,7 +63,7 @@ process METRICS {
gatk CollectMultipleMetrics \
--java-options '-Xmx${params.metrics_memory} -Djava.io.tmpdir=tmp' \
--INPUT ${bam} \
--OUTPUT ${bam.baseName} \
--OUTPUT ${name} \
--REFERENCE_SEQUENCE ${params.reference} \
--PROGRAM QualityScoreDistribution \
--PROGRAM MeanQualityByCycle \
Expand All @@ -72,4 +74,31 @@ process METRICS {
--PROGRAM CollectSequencingArtifactMetrics \
--PROGRAM CollectSequencingArtifactMetrics
"""
}
}

process COVERAGE_ANALYSIS {
cpus params.metrics_cpus
memory params.metrics_memory
tag "${name}"
publishDir "${params.output}/${name}/metrics/coverage", mode: "copy"

conda (params.enable_conda ? "bioconda::samtools=1.12" : null)

input:
tuple val(name), val(type), file(bam), file(bai)

output:
file("${name}.coverage.tsv")
file("${name}.depth.tsv")

script:
minimum_base_quality = params.collect_hs_metrics_min_base_quality ?
"--min-BQ ${params.collect_hs_metrics_min_base_quality}" : ""
minimum_mapping_quality = params.collect_hs_metrics_min_mapping_quality ?
"--min-MQ ${params.collect_hs_metrics_min_mapping_quality}" : ""
intervals = params.intervals_bed ? "-b ${params.intervals_bed}" : ""
"""
samtools coverage ${minimum_base_quality} ${minimum_mapping_quality} ${bam} > ${name}.coverage.tsv
samtools depth -s -d 0 -H ${intervals} ${bam} > ${name}.depth.tsv
"""
}
14 changes: 7 additions & 7 deletions modules/04_realignment_around_indels.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,18 @@ process REALIGNMENT_AROUND_INDELS {
cpus "${params.realignment_around_indels_cpus}"
memory "${params.realignment_around_indels_memory}"
tag "${name}"
publishDir "${params.output}/${name}", mode: "copy", pattern: "*.RA.intervals"
publishDir "${params.output}/${name}/metrics/realignment", mode: "copy", pattern: "*.RA.intervals"

// NOTE: this dependency is fixed to GATK 3 as the realignment around indels is not anymore maintained in GATK 4
// but still for some reason for GATK 3 to work the dependency to GATK 4 is needed
conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0 bioconda::gatk=3.8" : null)

input:
tuple val(name), val(bam_name), val(type), file(bam), file(bai)
tuple val(name), val(type), file(bam), file(bai)

output:
tuple val(name), val(bam_name), val(type), file("${bam.baseName}.realigned.bam"), file("${bam.baseName}.realigned.bai"), emit: realigned_bams
file("${bam.baseName}.RA.intervals")
tuple val(name), val(type), file("${name}.realigned.bam"), file("${name}.realigned.bai"), emit: realigned_bams
file("${name}.RA.intervals")

script:
known_indels1 = params.known_indels1 ? " --known ${params.known_indels1}" : ""
Expand All @@ -33,14 +33,14 @@ process REALIGNMENT_AROUND_INDELS {
gatk3 -Xmx${params.realignment_around_indels_memory} -Djava.io.tmpdir=tmp -T RealignerTargetCreator \
--input_file ${bam} \
--out ${bam.baseName}.RA.intervals \
--out ${name}.RA.intervals \
--reference_sequence ${params.reference} ${known_indels1} ${known_indels2}
gatk3 -Xmx${params.realignment_around_indels_memory} -Djava.io.tmpdir=tmp -T IndelRealigner \
--input_file ${bam} \
--out ${bam.baseName}.realigned.bam \
--out ${name}.realigned.bam \
--reference_sequence ${params.reference} \
--targetIntervals ${bam.baseName}.RA.intervals \
--targetIntervals ${name}.RA.intervals \
--consensusDeterminationModel USE_SW \
--LODThresholdForCleaning 0.4 \
--maxReadsInMemory 600000 ${known_alleles1} ${known_alleles2}
Expand Down
26 changes: 13 additions & 13 deletions modules/05_bqsr.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,30 @@ process BQSR {
conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0" : null)

input:
tuple val(name), val(bam_name), val(type), file(bam), file(bai)
tuple val(name), val(type), file(bam), file(bai)

output:
tuple val("${name}"), val("${type}"), val("${params.output}/${name}/${bam_name}.preprocessed.bam"), emit: recalibrated_bams
file "${bam_name}.recalibration_report.grp"
file "${bam_name}.preprocessed.bam"
file "${bam_name}.preprocessed.bai"
file "${name}.recalibration_report.grp"
file "${name}.preprocessed.bam"
file "${name}.preprocessed.bai"

"""
mkdir tmp
gatk BaseRecalibrator \
--java-options '-Xmx${params.bqsr_memory} -Djava.io.tmpdir=tmp' \
--input ${bam} \
--output ${bam_name}.recalibration_report.grp \
--output ${name}.recalibration_report.grp \
--reference ${params.reference} \
--known-sites ${params.dbsnp}
gatk ApplyBQSR \
--java-options '-Xmx${params.bqsr_memory} -Djava.io.tmpdir=tmp' \
--input ${bam} \
--output ${bam_name}.preprocessed.bam \
--output ${name}.preprocessed.bam \
--reference ${params.reference} \
--bqsr-recal-file ${bam_name}.recalibration_report.grp
--bqsr-recal-file ${name}.recalibration_report.grp
"""
}

Expand All @@ -49,15 +49,15 @@ process CREATE_OUTPUT {
tag "${name}"

input:
tuple val(name), val(bam_name), val(type), file(bam), file(bai)
tuple val(name), val(type), file(bam), file(bai)

output:
tuple val("${name}"), val("${type}"), val("${params.output}/${name}/${bam_name}.preprocessed.bam"), emit: recalibrated_bams
file "${bam_name}.preprocessed.bam"
file "${bam_name}.preprocessed.bai"
tuple val("${name}"), val("${type}"), val("${params.output}/${name}/${name}.preprocessed.bam"), emit: recalibrated_bams
file "${name}.preprocessed.bam"
file "${name}.preprocessed.bai"

"""
cp ${bam} ${bam_name}.preprocessed.bam
cp ${bai} ${bam_name}.preprocessed.bai
cp ${bam} ${name}.preprocessed.bam
cp ${bai} ${name}.preprocessed.bai
"""
}
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ profiles {
params.known_indels1 = "$baseDir/test_data/1000G_phase1.indels.hg19.sites.minimal.vcf"
params.known_indels2 = "$baseDir/test_data/Mills_and_1000G_gold_standard.indels.hg19.sites.sorted.minimal.vcf"
params.intervals = "$baseDir/test_data/minimal_intervals.intervals"
params.intervals_bed = "$baseDir/test_data/minimal_intervals.bed"
params.dbsnp = "$baseDir/test_data/dbsnp_138.hg19.minimal.vcf"
timeline.enabled = false
report.enabled = false
Expand Down
8 changes: 4 additions & 4 deletions tests/test_01.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ source tests/assert.sh
output=output/test1
nextflow main.nf -profile test,conda --output $output

test -s $output/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample1/sample1.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample1/sample1.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample2/sample2.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample2/sample2.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
8 changes: 4 additions & 4 deletions tests/test_02.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ source tests/assert.sh
output=output/test2
nextflow main.nf -profile test,conda --output $output --skip_bqsr

test -s $output/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample1/sample1.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample1/sample1.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample2/sample2.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample2/sample2.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
8 changes: 4 additions & 4 deletions tests/test_03.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ source tests/assert.sh
output=output/test3
nextflow main.nf -profile test,conda --output $output --skip_realignment

test -s $output/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample1/sample1.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample1/sample1.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample2/sample2.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample2/sample2.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
8 changes: 4 additions & 4 deletions tests/test_04.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ source tests/assert.sh
output=output/test4
nextflow main.nf -profile test,conda --output $output --skip_deduplication

test -s $output/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample1/sample1.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample1/sample1.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample2/sample2.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample2/sample2.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
8 changes: 4 additions & 4 deletions tests/test_05.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ source tests/assert.sh
output=output/test5
nextflow main.nf -profile test,conda --output $output --skip_deduplication --skip_bqsr --skip_metrics --known_indels1 false --known_indels2 false

test -s $output/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample1/sample1.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample1/sample1.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample2/sample2.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample2/sample2.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
8 changes: 4 additions & 4 deletions tests/test_06.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ source tests/assert.sh
output=output/test6
nextflow main.nf -profile test,conda --output $output --intervals false --skip_deduplication --skip_bqsr --skip_realignment

test -s $output/sample1/TESTX_S1_L001.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample1/TESTX_S1_L001.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample2/TESTX_S1_L002.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample2/TESTX_S1_L002.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample1/sample1.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample1/sample1.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
test -s $output/sample2/sample2.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
test -s $output/sample2/sample2.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
Loading

0 comments on commit 233ab0b

Please sign in to comment.