From 7fc5a411ee7c09f2c93d19274a27f2e12a5f03ad Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 17 Oct 2022 22:54:41 +0200 Subject: [PATCH 1/9] add optional split cigar n reads step --- main.nf | 16 ++++++++---- modules/01_prepare_bam.nf | 6 ++--- modules/02_mark_duplicates.nf | 34 +++++++++++++++++++++++++ modules/03_metrics.nf | 4 +-- modules/04_realignment_around_indels.nf | 6 ++--- modules/05_bqsr.nf | 6 ++--- nextflow.config | 2 +- 7 files changed, 57 insertions(+), 17 deletions(-) diff --git a/main.nf b/main.nf index 8e517de..7c0b59a 100755 --- a/main.nf +++ b/main.nf @@ -3,7 +3,7 @@ nextflow.enable.dsl = 2 include { PREPARE_BAM; INDEX_BAM } from './modules/01_prepare_bam' -include { MARK_DUPLICATES } from './modules/02_mark_duplicates' +include { MARK_DUPLICATES; SPLIT_CIGAR_N_READS } from './modules/02_mark_duplicates' include { METRICS; HS_METRICS; COVERAGE_ANALYSIS } from './modules/03_metrics' include { REALIGNMENT_AROUND_INDELS } from './modules/04_realignment_around_indels' include { BQSR; CREATE_OUTPUT } from './modules/05_bqsr' @@ -26,6 +26,7 @@ params.output = 'output' params.platform = "ILLUMINA" params.collect_hs_metrics_min_base_quality = false params.collect_hs_metrics_min_mapping_quality = false +params.split_cigarn = false // computational resources params.prepare_bam_cpus = 3 @@ -84,7 +85,7 @@ else if (params.input_files) { workflow { - PREPARE_BAM(input_files) + PREPARE_BAM(input_files, params.reference) if (!params.skip_deduplication) { MARK_DUPLICATES(PREPARE_BAM.out.prepared_bams) @@ -95,16 +96,21 @@ workflow { deduplicated_bams = INDEX_BAM.out.indexed_bams } + if (params.split_cigarn) { + SPLIT_CIGAR_N_READS(deduplicated_bams, params.reference) + deduplicated_bams = SPLIT_CIGAR_N_READS.out.split_cigarn_bams + } + if (! params.skip_metrics) { if (params.intervals) { HS_METRICS(deduplicated_bams) } - METRICS(deduplicated_bams) + METRICS(deduplicated_bams, params.reference) COVERAGE_ANALYSIS(deduplicated_bams) } if (!params.skip_realignment) { - REALIGNMENT_AROUND_INDELS(deduplicated_bams) + REALIGNMENT_AROUND_INDELS(deduplicated_bams, params.reference) realigned_bams = REALIGNMENT_AROUND_INDELS.out.realigned_bams } else { @@ -112,7 +118,7 @@ workflow { } if (!params.skip_bqsr) { - BQSR(realigned_bams) + BQSR(realigned_bams, params.reference) preprocessed_bams = BQSR.out.recalibrated_bams } else { diff --git a/modules/01_prepare_bam.nf b/modules/01_prepare_bam.nf index c1b3c01..45a0d28 100644 --- a/modules/01_prepare_bam.nf +++ b/modules/01_prepare_bam.nf @@ -3,7 +3,6 @@ params.prepare_bam_memory = "8g" params.index_cpus = 1 params.index_memory = "8g" params.platform = "ILLUMINA" -params.reference = false params.skip_deduplication = false params.output = 'output' @@ -22,6 +21,7 @@ process PREPARE_BAM { input: tuple val(name), val(type), file(bam) + val(reference) output: tuple val(name), val(type), file("${name}.prepared.bam"), emit: prepared_bams @@ -41,7 +41,7 @@ process PREPARE_BAM { --VALIDATION_STRINGENCY SILENT \ --INPUT ${name}.sorted.bam \ --OUTPUT /dev/stdout \ - --REFERENCE_SEQUENCE ${params.reference} \ + --REFERENCE_SEQUENCE ${reference} \ --RGPU 1 \ --RGID 1 \ --RGSM ${type} \ @@ -55,7 +55,7 @@ process PREPARE_BAM { --java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \ --INPUT /dev/stdin \ --OUTPUT ${name}.prepared.bam \ - --SEQUENCE_DICTIONARY ${params.reference} + --SEQUENCE_DICTIONARY ${reference} rm -f ${name}.sorted.bam diff --git a/modules/02_mark_duplicates.nf b/modules/02_mark_duplicates.nf index 3e4307f..94097ee 100644 --- a/modules/02_mark_duplicates.nf +++ b/modules/02_mark_duplicates.nf @@ -49,3 +49,37 @@ process MARK_DUPLICATES { gatk --version >> software_versions.${task.process}.txt """ } + +process SPLIT_CIGAR_N_READS { + cpus "${params.prepare_bam_cpus}" + memory "${params.prepare_bam_memory}" + tag "${name}" + publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*" + + conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null) + + input: + tuple val(name), val(type), file(bam), file(bai) + val(reference) + + output: + tuple val(name), val(type), file("${name}.split_cigarn.bam"), file("${name}.split_cigarn.bam.bai"), emit: split_cigarn_bams + file("software_versions.${task.process}.txt") + + script: + """ + mkdir tmp + + gatk SplitNCigarReads \ + --java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \ + --INPUT ${bam} \ + --OUTPUT ${name}.split_cigarn.bam \ + --CREATE_INDEX true \ + --REFERENCE ${reference} + + cp ${name}.split_cigarn.bai ${name}.split_cigarn.bam.bai + + echo ${params.manifest} >> software_versions.${task.process}.txt + gatk --version >> software_versions.${task.process}.txt + """ +} diff --git a/modules/03_metrics.nf b/modules/03_metrics.nf index ed08e96..04dc1a8 100644 --- a/modules/03_metrics.nf +++ b/modules/03_metrics.nf @@ -2,7 +2,6 @@ params.metrics_cpus = 1 params.metrics_memory = "8g" params.collect_hs_metrics_min_base_quality = false params.collect_hs_metrics_min_mapping_quality = false -params.reference = false params.output = 'output' params.intervals = false @@ -63,6 +62,7 @@ process METRICS { input: tuple val(name), val(type), file(bam), file(bai) + val(reference) output: file("*_metrics") optional true @@ -76,7 +76,7 @@ process METRICS { --java-options '-Xmx${params.metrics_memory} -Djava.io.tmpdir=./tmp' \ --INPUT ${bam} \ --OUTPUT ${name} \ - --REFERENCE_SEQUENCE ${params.reference} \ + --REFERENCE_SEQUENCE ${reference} \ --PROGRAM QualityScoreDistribution \ --PROGRAM MeanQualityByCycle \ --PROGRAM CollectAlignmentSummaryMetrics \ diff --git a/modules/04_realignment_around_indels.nf b/modules/04_realignment_around_indels.nf index 15fa013..f517d94 100644 --- a/modules/04_realignment_around_indels.nf +++ b/modules/04_realignment_around_indels.nf @@ -2,7 +2,6 @@ params.realignment_around_indels_cpus = 2 params.realignment_around_indels_memory = "31g" params.known_indels1 = false params.known_indels2 = false -params.reference = false params.output = 'output' @@ -19,6 +18,7 @@ process REALIGNMENT_AROUND_INDELS { input: tuple val(name), val(type), file(bam), file(bai) + val(reference) output: tuple val(name), val(type), file("${name}.realigned.bam"), file("${name}.realigned.bai"), emit: realigned_bams @@ -36,12 +36,12 @@ process REALIGNMENT_AROUND_INDELS { gatk3 -Xmx${params.realignment_around_indels_memory} -Djava.io.tmpdir=./tmp -T RealignerTargetCreator \ --input_file ${bam} \ --out ${name}.RA.intervals \ - --reference_sequence ${params.reference} ${known_indels1} ${known_indels2} + --reference_sequence ${reference} ${known_indels1} ${known_indels2} gatk3 -Xmx${params.realignment_around_indels_memory} -Djava.io.tmpdir=./tmp -T IndelRealigner \ --input_file ${bam} \ --out ${name}.realigned.bam \ - --reference_sequence ${params.reference} \ + --reference_sequence ${reference} \ --targetIntervals ${name}.RA.intervals \ --consensusDeterminationModel USE_SW \ --LODThresholdForCleaning 0.4 \ diff --git a/modules/05_bqsr.nf b/modules/05_bqsr.nf index 0fa8405..ba94d75 100644 --- a/modules/05_bqsr.nf +++ b/modules/05_bqsr.nf @@ -1,6 +1,5 @@ params.bqsr_cpus = 3 params.bqsr_memory = "4g" -params.reference = false params.dbsnp = false params.output = 'output' @@ -16,6 +15,7 @@ process BQSR { input: tuple val(name), val(type), file(bam), file(bai) + val(reference) output: tuple val("${name}"), val("${type}"), val("${params.output}/${name}/${bam_name}.preprocessed.bam"), emit: recalibrated_bams @@ -31,14 +31,14 @@ process BQSR { --java-options '-Xmx${params.bqsr_memory} -Djava.io.tmpdir=./tmp' \ --input ${bam} \ --output ${name}.recalibration_report.grp \ - --reference ${params.reference} \ + --reference ${reference} \ --known-sites ${params.dbsnp} gatk ApplyBQSR \ --java-options '-Xmx${params.bqsr_memory} -Djava.io.tmpdir=./tmp' \ --input ${bam} \ --output ${name}.preprocessed.bam \ - --reference ${params.reference} \ + --reference ${reference} \ --bqsr-recal-file ${name}.recalibration_report.grp echo ${params.manifest} >> software_versions.${task.process}.txt diff --git a/nextflow.config b/nextflow.config index 5573d6f..c806cb0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -44,7 +44,7 @@ process.shell = ['/bin/bash', '-euo', 'pipefail'] cleanup = true -VERSION = '1.9.1' +VERSION = '2.0.0' DOI = 'https://zenodo.org/badge/latestdoi/358400957' manifest { From a878a188047466e7a0844a2e2f0c9bb8e00492ba Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 17 Oct 2022 23:51:22 +0200 Subject: [PATCH 2/9] fix split cigar n reads step and add test --- Makefile | 1 + modules/02_mark_duplicates.nf | 8 ++++---- tests/test_10.sh | 11 +++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) create mode 100644 tests/test_10.sh diff --git a/Makefile b/Makefile index 434cfd2..7eb699b 100644 --- a/Makefile +++ b/Makefile @@ -16,3 +16,4 @@ test: bash tests/test_07.sh bash tests/test_08.sh bash tests/test_09.sh + bash tests/test_10.sh diff --git a/modules/02_mark_duplicates.nf b/modules/02_mark_duplicates.nf index 94097ee..e74f990 100644 --- a/modules/02_mark_duplicates.nf +++ b/modules/02_mark_duplicates.nf @@ -72,10 +72,10 @@ process SPLIT_CIGAR_N_READS { gatk SplitNCigarReads \ --java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \ - --INPUT ${bam} \ - --OUTPUT ${name}.split_cigarn.bam \ - --CREATE_INDEX true \ - --REFERENCE ${reference} + --input ${bam} \ + --output ${name}.split_cigarn.bam \ + --create-output-bam-index true \ + --reference ${reference} cp ${name}.split_cigarn.bai ${name}.split_cigarn.bam.bai diff --git a/tests/test_10.sh b/tests/test_10.sh new file mode 100644 index 0000000..366617a --- /dev/null +++ b/tests/test_10.sh @@ -0,0 +1,11 @@ +#!/bin/bash + + +source tests/assert.sh +output=output/test10 +nextflow main.nf -profile test,conda --output $output --skip_realignment --split_cigarn + +test -s $output/sample1/sample1.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; } +test -s $output/sample1/sample1.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; } +test -s $output/sample2/sample2.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; } +test -s $output/sample2/sample2.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; } \ No newline at end of file From 8c87d7aba591b745fcedcbfa8100a5860163d0fa Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Tue, 18 Oct 2022 00:27:15 +0200 Subject: [PATCH 3/9] replace Picard MarkDuplicates by sambamba --- main.nf | 3 ++- modules/02_mark_duplicates.nf | 39 +++++++++++++++++------------------ modules/03_metrics.nf | 28 +++++++++++++++++++++++++ tests/test_08.sh | 2 +- tests/test_10.sh | 4 +++- 5 files changed, 53 insertions(+), 23 deletions(-) diff --git a/main.nf b/main.nf index 7c0b59a..d7a37c8 100755 --- a/main.nf +++ b/main.nf @@ -4,7 +4,7 @@ nextflow.enable.dsl = 2 include { PREPARE_BAM; INDEX_BAM } from './modules/01_prepare_bam' include { MARK_DUPLICATES; SPLIT_CIGAR_N_READS } from './modules/02_mark_duplicates' -include { METRICS; HS_METRICS; COVERAGE_ANALYSIS } from './modules/03_metrics' +include { METRICS; HS_METRICS; COVERAGE_ANALYSIS; FLAGSTAT } from './modules/03_metrics' include { REALIGNMENT_AROUND_INDELS } from './modules/04_realignment_around_indels' include { BQSR; CREATE_OUTPUT } from './modules/05_bqsr' @@ -107,6 +107,7 @@ workflow { } METRICS(deduplicated_bams, params.reference) COVERAGE_ANALYSIS(deduplicated_bams) + FLAGSTAT(deduplicated_bams) } if (!params.skip_realignment) { diff --git a/modules/02_mark_duplicates.nf b/modules/02_mark_duplicates.nf index e74f990..d953539 100644 --- a/modules/02_mark_duplicates.nf +++ b/modules/02_mark_duplicates.nf @@ -1,7 +1,6 @@ params.mark_duplicates_cpus = 2 params.mark_duplicates_memory = "16g" params.remove_duplicates = true -params.skip_metrics = false params.output = 'output' @@ -9,44 +8,44 @@ process MARK_DUPLICATES { cpus "${params.mark_duplicates_cpus}" memory "${params.mark_duplicates_memory}" tag "${name}" - publishDir "${params.output}/${name}/metrics/mark_duplicates", mode: "copy", pattern: "*.dedup_metrics.txt" publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*" - conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null) + conda (params.enable_conda ? "bioconda::sambamba=0.8.2" : null) input: tuple val(name), val(type), file(bam) output: tuple val(name), val(type), file("${name}.dedup.bam"), file("${name}.dedup.bam.bai"), emit: deduplicated_bams - file("${name}.dedup_metrics.txt") optional true file("software_versions.${task.process}.txt") script: - dedup_metrics = params.skip_metrics ? "": "--METRICS_FILE ${name}.dedup_metrics.txt" - remove_duplicates = params.remove_duplicates ? "--REMOVE_DUPLICATES true" : "--REMOVE_DUPLICATES false" + remove_duplicates_param = params.remove_duplicates ? "--remove-duplicates" : "" """ mkdir tmp - gatk SortSam \ - --java-options '-Xmx${params.mark_duplicates_memory} -Djava.io.tmpdir=./tmp' \ - --INPUT ${bam} \ - --OUTPUT ${name}.sorted.bam \ - --SORT_ORDER coordinate - - gatk MarkDuplicates \ - --java-options '-Xmx${params.mark_duplicates_memory} -Djava.io.tmpdir=./tmp' \ - --INPUT ${name}.sorted.bam \ - --OUTPUT ${name}.dedup.bam \ - --ASSUME_SORT_ORDER coordinate \ - --CREATE_INDEX true ${remove_duplicates} ${dedup_metrics} + # sort again + sambamba sort \ + --nthreads=${task.cpus} \ + --tmpdir=./tmp \ + --out=${name}.sorted.bam \ + ${bam} - cp ${name}.dedup.bai ${name}.dedup.bam.bai + # removes duplicates (sorted from the alignment process) + sambamba markdup ${remove_duplicates_param} \ + --nthreads=${task.cpus} \ + --tmpdir=./tmp \ + ${name}.sorted.bam ${name}.dedup.bam rm -f ${name}.sorted.bam + # indexes the output BAM file + sambamba index \ + --nthreads=${task.cpus} \ + ${name}.dedup.bam ${name}.dedup.bam.bai + echo ${params.manifest} >> software_versions.${task.process}.txt - gatk --version >> software_versions.${task.process}.txt + sambamba --version >> software_versions.${task.process}.txt """ } diff --git a/modules/03_metrics.nf b/modules/03_metrics.nf index 04dc1a8..ba3aef8 100644 --- a/modules/03_metrics.nf +++ b/modules/03_metrics.nf @@ -122,3 +122,31 @@ process COVERAGE_ANALYSIS { samtools --version >> software_versions.${task.process}.txt """ } + +process FLAGSTAT { + cpus "${params.metrics_cpus}" + memory "${params.metrics_memory}" + tag "${name}" + publishDir "${params.output}/${name}/metrics/flagstat", mode: "copy", pattern: "*.flagstat.csv" + publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*" + + conda (params.enable_conda ? "bioconda::sambamba=0.8.2" : null) + + input: + tuple val(name), val(type), file(bam) + + output: + file("${name}.flagstat.csv") + file("software_versions.${task.process}.txt") + + script: + """ + sambamba flagstat \ + --nthreads=${task.cpus} \ + --tabular \ + ${bam} > ${name}.flagstat.csv + + echo ${params.manifest} >> software_versions.${task.process}.txt + sambamba --version >> software_versions.${task.process}.txt + """ +} diff --git a/tests/test_08.sh b/tests/test_08.sh index 4337444..9d3f387 100644 --- a/tests/test_08.sh +++ b/tests/test_08.sh @@ -9,7 +9,7 @@ nextflow main.nf -profile test,conda --output $output --collect_hs_metrics_min_b test -s $output/sample1/sample1.preprocessed.bam || { echo "Missing BAM file!"; exit 1; } test -s $output/sample1/sample1.preprocessed.bai || { echo "Missing BAI file!"; exit 1; } test -s $output/sample1/metrics/hs_metrics/sample1.hs_metrics.txt || { echo "Missing HS metrics file!"; exit 1; } -test -s $output/sample1/metrics/mark_duplicates/sample1.dedup_metrics.txt || { echo "Missing dedup metrics file!"; exit 1; } +test -s $output/sample1/metrics/flagstat/sample1.flagstat.csv || { echo "Missing dedup metrics file!"; exit 1; } test -s $output/sample1/metrics/coverage/sample1.coverage.tsv || { echo "Missing horizontal coverage file!"; exit 1; } test -s $output/sample1/metrics/coverage/sample1.depth.tsv || { echo "Missing depth of coverage file!"; exit 1; } test -s $output/sample2/sample2.preprocessed.bam || { echo "Missing BAM file!"; exit 1; } diff --git a/tests/test_10.sh b/tests/test_10.sh index 366617a..4844e56 100644 --- a/tests/test_10.sh +++ b/tests/test_10.sh @@ -7,5 +7,7 @@ nextflow main.nf -profile test,conda --output $output --skip_realignment --split test -s $output/sample1/sample1.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; } test -s $output/sample1/sample1.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; } +test -s $output/sample1/metrics/flagstat/sample1.flagstat.csv || { echo "Missing output metrics file!"; exit 1; } test -s $output/sample2/sample2.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; } -test -s $output/sample2/sample2.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; } \ No newline at end of file +test -s $output/sample2/sample2.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; } +test -s $output/sample1/metrics/flagstat/sample2.flagstat.csv || { echo "Missing output metrics file!"; exit 1; } \ No newline at end of file From 17b2000113e3b67c5580d4a5c05108d7b4683b08 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Tue, 18 Oct 2022 00:36:45 +0200 Subject: [PATCH 4/9] remove unneeded sort operation --- modules/01_prepare_bam.nf | 12 ++---------- modules/03_metrics.nf | 2 +- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/modules/01_prepare_bam.nf b/modules/01_prepare_bam.nf index 45a0d28..f9e5194 100644 --- a/modules/01_prepare_bam.nf +++ b/modules/01_prepare_bam.nf @@ -3,7 +3,6 @@ params.prepare_bam_memory = "8g" params.index_cpus = 1 params.index_memory = "8g" params.platform = "ILLUMINA" -params.skip_deduplication = false params.output = 'output' /* @@ -28,25 +27,20 @@ process PREPARE_BAM { file("software_versions.${task.process}.txt") script: - order = params.skip_deduplication ? "--SORT_ORDER coordinate": "--SORT_ORDER queryname" """ mkdir tmp - samtools sort \ - --threads ${params.prepare_bam_cpus} \ - -o ${name}.sorted.bam ${bam} - gatk AddOrReplaceReadGroups \ --java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \ --VALIDATION_STRINGENCY SILENT \ - --INPUT ${name}.sorted.bam \ + --INPUT ${bam} \ --OUTPUT /dev/stdout \ --REFERENCE_SEQUENCE ${reference} \ --RGPU 1 \ --RGID 1 \ --RGSM ${type} \ --RGLB 1 \ - --RGPL ${params.platform} ${order} | \ + --RGPL ${params.platform} | \ gatk CleanSam \ --java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \ --INPUT /dev/stdin \ @@ -57,8 +51,6 @@ process PREPARE_BAM { --OUTPUT ${name}.prepared.bam \ --SEQUENCE_DICTIONARY ${reference} - rm -f ${name}.sorted.bam - echo ${params.manifest} >> software_versions.${task.process}.txt gatk --version >> software_versions.${task.process}.txt samtools --version >> software_versions.${task.process}.txt diff --git a/modules/03_metrics.nf b/modules/03_metrics.nf index ba3aef8..f279158 100644 --- a/modules/03_metrics.nf +++ b/modules/03_metrics.nf @@ -133,7 +133,7 @@ process FLAGSTAT { conda (params.enable_conda ? "bioconda::sambamba=0.8.2" : null) input: - tuple val(name), val(type), file(bam) + tuple val(name), val(type), file(bam), file(bai) output: file("${name}.flagstat.csv") From 8323a58df66f74e6ceb135dee15cf073a16f3b5b Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Tue, 18 Oct 2022 00:42:31 +0200 Subject: [PATCH 5/9] remove unneeded dependency to samtools --- modules/01_prepare_bam.nf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/01_prepare_bam.nf b/modules/01_prepare_bam.nf index f9e5194..e403fa6 100644 --- a/modules/01_prepare_bam.nf +++ b/modules/01_prepare_bam.nf @@ -16,7 +16,7 @@ process PREPARE_BAM { tag "${name}" publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*" - conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0 bioconda::samtools=1.12" : null) + conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null) input: tuple val(name), val(type), file(bam) @@ -53,7 +53,6 @@ process PREPARE_BAM { echo ${params.manifest} >> software_versions.${task.process}.txt gatk --version >> software_versions.${task.process}.txt - samtools --version >> software_versions.${task.process}.txt """ } From 737bed2caf18d1e893f5f8e7d923c36062baaee7 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Tue, 18 Oct 2022 07:44:11 +0200 Subject: [PATCH 6/9] fix nextflow version in tests --- .github/workflows/automated_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/automated_tests.yml b/.github/workflows/automated_tests.yml index 3b27f45..fde92fb 100644 --- a/.github/workflows/automated_tests.yml +++ b/.github/workflows/automated_tests.yml @@ -29,4 +29,5 @@ jobs: key: ${{ runner.os }}-tronflow-bam-preprocessing - name: Run tests run: | + export NXF_VER=22.04.5 make \ No newline at end of file From 7f9cea0f9675fccf61ece97e01bcd41a30f56d11 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo-Ferreiro Date: Tue, 18 Oct 2022 14:55:41 +0200 Subject: [PATCH 7/9] add a sort step when no mark duplicates is applied --- modules/01_prepare_bam.nf | 18 +++++++++++++----- modules/02_mark_duplicates.nf | 2 +- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/modules/01_prepare_bam.nf b/modules/01_prepare_bam.nf index e403fa6..f511399 100644 --- a/modules/01_prepare_bam.nf +++ b/modules/01_prepare_bam.nf @@ -62,7 +62,7 @@ process INDEX_BAM { tag "${name}" publishDir "${params.output}/${name}", mode: "copy", pattern: "software_versions.*" - conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null) + conda (params.enable_conda ? "bioconda::sambamba=0.8.2" : null) input: tuple val(name), val(type), file(bam) @@ -75,11 +75,19 @@ process INDEX_BAM { """ mkdir tmp - gatk BuildBamIndex \ - --java-options '-Xmx8g -Djava.io.tmpdir=./tmp' \ - --INPUT ${bam} + # sort + sambamba sort \ + --nthreads=${task.cpus} \ + --tmpdir=./tmp \ + --out=${name}.sorted.bam \ + ${bam} + + # indexes the output BAM file + sambamba index \ + --nthreads=${task.cpus} \ + ${name}.sorted.bam ${name}.sorted.bam.bai echo ${params.manifest} >> software_versions.${task.process}.txt - gatk --version >> software_versions.${task.process}.txt + sambamba --version >> software_versions.${task.process}.txt """ } diff --git a/modules/02_mark_duplicates.nf b/modules/02_mark_duplicates.nf index d953539..3be3305 100644 --- a/modules/02_mark_duplicates.nf +++ b/modules/02_mark_duplicates.nf @@ -24,7 +24,7 @@ process MARK_DUPLICATES { """ mkdir tmp - # sort again + # sort sambamba sort \ --nthreads=${task.cpus} \ --tmpdir=./tmp \ From e80acfa0862ba0fac45107a02c4db009b19c46aa Mon Sep 17 00:00:00 2001 From: Pablo Riesgo-Ferreiro Date: Tue, 18 Oct 2022 18:32:16 +0200 Subject: [PATCH 8/9] fix expected output indexing step --- modules/01_prepare_bam.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/01_prepare_bam.nf b/modules/01_prepare_bam.nf index f511399..d410496 100644 --- a/modules/01_prepare_bam.nf +++ b/modules/01_prepare_bam.nf @@ -68,7 +68,7 @@ process INDEX_BAM { tuple val(name), val(type), file(bam) output: - tuple val(name), val(type), file("${bam}"), file("${bam.baseName}.bai"), emit: indexed_bams + tuple val(name), val(type), file("${name}.sorted.bam"), file("${name}.sorted.bam.bai"), emit: indexed_bams file("software_versions.${task.process}.txt") script: From 7c514beaeb5930e45e6b97d275ff48d75aade8a6 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo-Ferreiro Date: Tue, 18 Oct 2022 20:50:55 +0200 Subject: [PATCH 9/9] fix tests --- tests/test_10.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_10.sh b/tests/test_10.sh index 4844e56..c0e57fc 100644 --- a/tests/test_10.sh +++ b/tests/test_10.sh @@ -10,4 +10,4 @@ test -s $output/sample1/sample1.preprocessed.bai || { echo "Missing output BAI f test -s $output/sample1/metrics/flagstat/sample1.flagstat.csv || { echo "Missing output metrics file!"; exit 1; } test -s $output/sample2/sample2.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; } test -s $output/sample2/sample2.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; } -test -s $output/sample1/metrics/flagstat/sample2.flagstat.csv || { echo "Missing output metrics file!"; exit 1; } \ No newline at end of file +test -s $output/sample2/metrics/flagstat/sample2.flagstat.csv || { echo "Missing output metrics file!"; exit 1; } \ No newline at end of file