diff --git a/.github/workflows/automated_tests.yml b/.github/workflows/automated_tests.yml index 3b27f45..fde92fb 100644 --- a/.github/workflows/automated_tests.yml +++ b/.github/workflows/automated_tests.yml @@ -29,4 +29,5 @@ jobs: key: ${{ runner.os }}-tronflow-bam-preprocessing - name: Run tests run: | + export NXF_VER=22.04.5 make \ No newline at end of file diff --git a/Makefile b/Makefile index 434cfd2..7eb699b 100644 --- a/Makefile +++ b/Makefile @@ -16,3 +16,4 @@ test: bash tests/test_07.sh bash tests/test_08.sh bash tests/test_09.sh + bash tests/test_10.sh diff --git a/main.nf b/main.nf index 8e517de..d7a37c8 100755 --- a/main.nf +++ b/main.nf @@ -3,8 +3,8 @@ nextflow.enable.dsl = 2 include { PREPARE_BAM; INDEX_BAM } from './modules/01_prepare_bam' -include { MARK_DUPLICATES } from './modules/02_mark_duplicates' -include { METRICS; HS_METRICS; COVERAGE_ANALYSIS } from './modules/03_metrics' +include { MARK_DUPLICATES; SPLIT_CIGAR_N_READS } from './modules/02_mark_duplicates' +include { METRICS; HS_METRICS; COVERAGE_ANALYSIS; FLAGSTAT } from './modules/03_metrics' include { REALIGNMENT_AROUND_INDELS } from './modules/04_realignment_around_indels' include { BQSR; CREATE_OUTPUT } from './modules/05_bqsr' @@ -26,6 +26,7 @@ params.output = 'output' params.platform = "ILLUMINA" params.collect_hs_metrics_min_base_quality = false params.collect_hs_metrics_min_mapping_quality = false +params.split_cigarn = false // computational resources params.prepare_bam_cpus = 3 @@ -84,7 +85,7 @@ else if (params.input_files) { workflow { - PREPARE_BAM(input_files) + PREPARE_BAM(input_files, params.reference) if (!params.skip_deduplication) { MARK_DUPLICATES(PREPARE_BAM.out.prepared_bams) @@ -95,16 +96,22 @@ workflow { deduplicated_bams = INDEX_BAM.out.indexed_bams } + if (params.split_cigarn) { + SPLIT_CIGAR_N_READS(deduplicated_bams, params.reference) + deduplicated_bams = SPLIT_CIGAR_N_READS.out.split_cigarn_bams + } + if (! params.skip_metrics) { if (params.intervals) { HS_METRICS(deduplicated_bams) } - METRICS(deduplicated_bams) + METRICS(deduplicated_bams, params.reference) COVERAGE_ANALYSIS(deduplicated_bams) + FLAGSTAT(deduplicated_bams) } if (!params.skip_realignment) { - REALIGNMENT_AROUND_INDELS(deduplicated_bams) + REALIGNMENT_AROUND_INDELS(deduplicated_bams, params.reference) realigned_bams = REALIGNMENT_AROUND_INDELS.out.realigned_bams } else { @@ -112,7 +119,7 @@ workflow { } if (!params.skip_bqsr) { - BQSR(realigned_bams) + BQSR(realigned_bams, params.reference) preprocessed_bams = BQSR.out.recalibrated_bams } else { diff --git a/modules/01_prepare_bam.nf b/modules/01_prepare_bam.nf index c1b3c01..d410496 100644 --- a/modules/01_prepare_bam.nf +++ b/modules/01_prepare_bam.nf @@ -3,8 +3,6 @@ params.prepare_bam_memory = "8g" params.index_cpus = 1 params.index_memory = "8g" params.platform = "ILLUMINA" -params.reference = false -params.skip_deduplication = false params.output = 'output' /* @@ -18,35 +16,31 @@ process PREPARE_BAM { tag "${name}" publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*" - conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0 bioconda::samtools=1.12" : null) + conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null) input: tuple val(name), val(type), file(bam) + val(reference) output: tuple val(name), val(type), file("${name}.prepared.bam"), emit: prepared_bams file("software_versions.${task.process}.txt") script: - order = params.skip_deduplication ? 
"--SORT_ORDER coordinate": "--SORT_ORDER queryname" """ mkdir tmp - samtools sort \ - --threads ${params.prepare_bam_cpus} \ - -o ${name}.sorted.bam ${bam} - gatk AddOrReplaceReadGroups \ --java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \ --VALIDATION_STRINGENCY SILENT \ - --INPUT ${name}.sorted.bam \ + --INPUT ${bam} \ --OUTPUT /dev/stdout \ - --REFERENCE_SEQUENCE ${params.reference} \ + --REFERENCE_SEQUENCE ${reference} \ --RGPU 1 \ --RGID 1 \ --RGSM ${type} \ --RGLB 1 \ - --RGPL ${params.platform} ${order} | \ + --RGPL ${params.platform} | \ gatk CleanSam \ --java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \ --INPUT /dev/stdin \ @@ -55,13 +49,10 @@ process PREPARE_BAM { --java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \ --INPUT /dev/stdin \ --OUTPUT ${name}.prepared.bam \ - --SEQUENCE_DICTIONARY ${params.reference} - - rm -f ${name}.sorted.bam + --SEQUENCE_DICTIONARY ${reference} echo ${params.manifest} >> software_versions.${task.process}.txt gatk --version >> software_versions.${task.process}.txt - samtools --version >> software_versions.${task.process}.txt """ } @@ -71,24 +62,32 @@ process INDEX_BAM { tag "${name}" publishDir "${params.output}/${name}", mode: "copy", pattern: "software_versions.*" - conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null) + conda (params.enable_conda ? "bioconda::sambamba=0.8.2" : null) input: tuple val(name), val(type), file(bam) output: - tuple val(name), val(type), file("${bam}"), file("${bam.baseName}.bai"), emit: indexed_bams + tuple val(name), val(type), file("${name}.sorted.bam"), file("${name}.sorted.bam.bai"), emit: indexed_bams file("software_versions.${task.process}.txt") script: """ mkdir tmp - gatk BuildBamIndex \ - --java-options '-Xmx8g -Djava.io.tmpdir=./tmp' \ - --INPUT ${bam} + # sort + sambamba sort \ + --nthreads=${task.cpus} \ + --tmpdir=./tmp \ + --out=${name}.sorted.bam \ + ${bam} + + # indexes the output BAM file + sambamba index \ + --nthreads=${task.cpus} \ + ${name}.sorted.bam ${name}.sorted.bam.bai echo ${params.manifest} >> software_versions.${task.process}.txt - gatk --version >> software_versions.${task.process}.txt + sambamba --version >> software_versions.${task.process}.txt """ } diff --git a/modules/02_mark_duplicates.nf b/modules/02_mark_duplicates.nf index 3e4307f..3be3305 100644 --- a/modules/02_mark_duplicates.nf +++ b/modules/02_mark_duplicates.nf @@ -1,7 +1,6 @@ params.mark_duplicates_cpus = 2 params.mark_duplicates_memory = "16g" params.remove_duplicates = true -params.skip_metrics = false params.output = 'output' @@ -9,42 +8,76 @@ process MARK_DUPLICATES { cpus "${params.mark_duplicates_cpus}" memory "${params.mark_duplicates_memory}" tag "${name}" - publishDir "${params.output}/${name}/metrics/mark_duplicates", mode: "copy", pattern: "*.dedup_metrics.txt" publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*" - conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null) + conda (params.enable_conda ? "bioconda::sambamba=0.8.2" : null) input: tuple val(name), val(type), file(bam) output: tuple val(name), val(type), file("${name}.dedup.bam"), file("${name}.dedup.bam.bai"), emit: deduplicated_bams - file("${name}.dedup_metrics.txt") optional true file("software_versions.${task.process}.txt") script: - dedup_metrics = params.skip_metrics ? "": "--METRICS_FILE ${name}.dedup_metrics.txt" - remove_duplicates = params.remove_duplicates ? 
"--REMOVE_DUPLICATES true" : "--REMOVE_DUPLICATES false" + remove_duplicates_param = params.remove_duplicates ? "--remove-duplicates" : "" """ mkdir tmp - gatk SortSam \ - --java-options '-Xmx${params.mark_duplicates_memory} -Djava.io.tmpdir=./tmp' \ - --INPUT ${bam} \ - --OUTPUT ${name}.sorted.bam \ - --SORT_ORDER coordinate - - gatk MarkDuplicates \ - --java-options '-Xmx${params.mark_duplicates_memory} -Djava.io.tmpdir=./tmp' \ - --INPUT ${name}.sorted.bam \ - --OUTPUT ${name}.dedup.bam \ - --ASSUME_SORT_ORDER coordinate \ - --CREATE_INDEX true ${remove_duplicates} ${dedup_metrics} + # sort + sambamba sort \ + --nthreads=${task.cpus} \ + --tmpdir=./tmp \ + --out=${name}.sorted.bam \ + ${bam} - cp ${name}.dedup.bai ${name}.dedup.bam.bai + # removes duplicates (sorted from the alignment process) + sambamba markdup ${remove_duplicates_param} \ + --nthreads=${task.cpus} \ + --tmpdir=./tmp \ + ${name}.sorted.bam ${name}.dedup.bam rm -f ${name}.sorted.bam + # indexes the output BAM file + sambamba index \ + --nthreads=${task.cpus} \ + ${name}.dedup.bam ${name}.dedup.bam.bai + + echo ${params.manifest} >> software_versions.${task.process}.txt + sambamba --version >> software_versions.${task.process}.txt + """ +} + +process SPLIT_CIGAR_N_READS { + cpus "${params.prepare_bam_cpus}" + memory "${params.prepare_bam_memory}" + tag "${name}" + publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*" + + conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null) + + input: + tuple val(name), val(type), file(bam), file(bai) + val(reference) + + output: + tuple val(name), val(type), file("${name}.split_cigarn.bam"), file("${name}.split_cigarn.bam.bai"), emit: split_cigarn_bams + file("software_versions.${task.process}.txt") + + script: + """ + mkdir tmp + + gatk SplitNCigarReads \ + --java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \ + --input ${bam} \ + --output ${name}.split_cigarn.bam \ + --create-output-bam-index true \ + --reference ${reference} + + cp ${name}.split_cigarn.bai ${name}.split_cigarn.bam.bai + echo ${params.manifest} >> software_versions.${task.process}.txt gatk --version >> software_versions.${task.process}.txt """ diff --git a/modules/03_metrics.nf b/modules/03_metrics.nf index ed08e96..f279158 100644 --- a/modules/03_metrics.nf +++ b/modules/03_metrics.nf @@ -2,7 +2,6 @@ params.metrics_cpus = 1 params.metrics_memory = "8g" params.collect_hs_metrics_min_base_quality = false params.collect_hs_metrics_min_mapping_quality = false -params.reference = false params.output = 'output' params.intervals = false @@ -63,6 +62,7 @@ process METRICS { input: tuple val(name), val(type), file(bam), file(bai) + val(reference) output: file("*_metrics") optional true @@ -76,7 +76,7 @@ process METRICS { --java-options '-Xmx${params.metrics_memory} -Djava.io.tmpdir=./tmp' \ --INPUT ${bam} \ --OUTPUT ${name} \ - --REFERENCE_SEQUENCE ${params.reference} \ + --REFERENCE_SEQUENCE ${reference} \ --PROGRAM QualityScoreDistribution \ --PROGRAM MeanQualityByCycle \ --PROGRAM CollectAlignmentSummaryMetrics \ @@ -122,3 +122,31 @@ process COVERAGE_ANALYSIS { samtools --version >> software_versions.${task.process}.txt """ } + +process FLAGSTAT { + cpus "${params.metrics_cpus}" + memory "${params.metrics_memory}" + tag "${name}" + publishDir "${params.output}/${name}/metrics/flagstat", mode: "copy", pattern: "*.flagstat.csv" + publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*" + + conda (params.enable_conda ? 
"bioconda::sambamba=0.8.2" : null) + + input: + tuple val(name), val(type), file(bam), file(bai) + + output: + file("${name}.flagstat.csv") + file("software_versions.${task.process}.txt") + + script: + """ + sambamba flagstat \ + --nthreads=${task.cpus} \ + --tabular \ + ${bam} > ${name}.flagstat.csv + + echo ${params.manifest} >> software_versions.${task.process}.txt + sambamba --version >> software_versions.${task.process}.txt + """ +} diff --git a/modules/04_realignment_around_indels.nf b/modules/04_realignment_around_indels.nf index 15fa013..f517d94 100644 --- a/modules/04_realignment_around_indels.nf +++ b/modules/04_realignment_around_indels.nf @@ -2,7 +2,6 @@ params.realignment_around_indels_cpus = 2 params.realignment_around_indels_memory = "31g" params.known_indels1 = false params.known_indels2 = false -params.reference = false params.output = 'output' @@ -19,6 +18,7 @@ process REALIGNMENT_AROUND_INDELS { input: tuple val(name), val(type), file(bam), file(bai) + val(reference) output: tuple val(name), val(type), file("${name}.realigned.bam"), file("${name}.realigned.bai"), emit: realigned_bams @@ -36,12 +36,12 @@ process REALIGNMENT_AROUND_INDELS { gatk3 -Xmx${params.realignment_around_indels_memory} -Djava.io.tmpdir=./tmp -T RealignerTargetCreator \ --input_file ${bam} \ --out ${name}.RA.intervals \ - --reference_sequence ${params.reference} ${known_indels1} ${known_indels2} + --reference_sequence ${reference} ${known_indels1} ${known_indels2} gatk3 -Xmx${params.realignment_around_indels_memory} -Djava.io.tmpdir=./tmp -T IndelRealigner \ --input_file ${bam} \ --out ${name}.realigned.bam \ - --reference_sequence ${params.reference} \ + --reference_sequence ${reference} \ --targetIntervals ${name}.RA.intervals \ --consensusDeterminationModel USE_SW \ --LODThresholdForCleaning 0.4 \ diff --git a/modules/05_bqsr.nf b/modules/05_bqsr.nf index 0fa8405..ba94d75 100644 --- a/modules/05_bqsr.nf +++ b/modules/05_bqsr.nf @@ -1,6 +1,5 @@ params.bqsr_cpus = 3 params.bqsr_memory = "4g" -params.reference = false params.dbsnp = false params.output = 'output' @@ -16,6 +15,7 @@ process BQSR { input: tuple val(name), val(type), file(bam), file(bai) + val(reference) output: tuple val("${name}"), val("${type}"), val("${params.output}/${name}/${bam_name}.preprocessed.bam"), emit: recalibrated_bams @@ -31,14 +31,14 @@ process BQSR { --java-options '-Xmx${params.bqsr_memory} -Djava.io.tmpdir=./tmp' \ --input ${bam} \ --output ${name}.recalibration_report.grp \ - --reference ${params.reference} \ + --reference ${reference} \ --known-sites ${params.dbsnp} gatk ApplyBQSR \ --java-options '-Xmx${params.bqsr_memory} -Djava.io.tmpdir=./tmp' \ --input ${bam} \ --output ${name}.preprocessed.bam \ - --reference ${params.reference} \ + --reference ${reference} \ --bqsr-recal-file ${name}.recalibration_report.grp echo ${params.manifest} >> software_versions.${task.process}.txt diff --git a/nextflow.config b/nextflow.config index 5573d6f..c806cb0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -44,7 +44,7 @@ process.shell = ['/bin/bash', '-euo', 'pipefail'] cleanup = true -VERSION = '1.9.1' +VERSION = '2.0.0' DOI = 'https://zenodo.org/badge/latestdoi/358400957' manifest { diff --git a/tests/test_08.sh b/tests/test_08.sh index 4337444..9d3f387 100644 --- a/tests/test_08.sh +++ b/tests/test_08.sh @@ -9,7 +9,7 @@ nextflow main.nf -profile test,conda --output $output --collect_hs_metrics_min_b test -s $output/sample1/sample1.preprocessed.bam || { echo "Missing BAM file!"; exit 1; } test -s 
$output/sample1/sample1.preprocessed.bai || { echo "Missing BAI file!"; exit 1; }
test -s $output/sample1/metrics/hs_metrics/sample1.hs_metrics.txt || { echo "Missing HS metrics file!"; exit 1; }
-test -s $output/sample1/metrics/mark_duplicates/sample1.dedup_metrics.txt || { echo "Missing dedup metrics file!"; exit 1; }
+test -s $output/sample1/metrics/flagstat/sample1.flagstat.csv || { echo "Missing flagstat file!"; exit 1; }
test -s $output/sample1/metrics/coverage/sample1.coverage.tsv || { echo "Missing horizontal coverage file!"; exit 1; }
test -s $output/sample1/metrics/coverage/sample1.depth.tsv || { echo "Missing depth of coverage file!"; exit 1; }
test -s $output/sample2/sample2.preprocessed.bam || { echo "Missing BAM file!"; exit 1; }
diff --git a/tests/test_10.sh b/tests/test_10.sh
new file mode 100644
index 0000000..c0e57fc
--- /dev/null
+++ b/tests/test_10.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+
+source tests/assert.sh
+output=output/test10
+nextflow main.nf -profile test,conda --output $output --skip_realignment --split_cigarn
+
+test -s $output/sample1/sample1.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
+test -s $output/sample1/sample1.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
+test -s $output/sample1/metrics/flagstat/sample1.flagstat.csv || { echo "Missing output metrics file!"; exit 1; }
+test -s $output/sample2/sample2.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
+test -s $output/sample2/sample2.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
+test -s $output/sample2/metrics/flagstat/sample2.flagstat.csv || { echo "Missing output metrics file!"; exit 1; }
\ No newline at end of file