From cab0df294d7845ce1a532124c1a1dc8eca0621ee Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 28 Feb 2022 10:17:27 +0100 Subject: [PATCH 1/4] upgrade GATK to 4.2.5.0 --- modules/01_prepare_bam.nf | 4 ++-- modules/02_mark_duplicates.nf | 2 +- modules/03_metrics.nf | 4 ++-- modules/04_realignment_around_indels.nf | 2 +- modules/05_bqsr.nf | 2 +- nextflow.config | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/modules/01_prepare_bam.nf b/modules/01_prepare_bam.nf index 0ab3a93..f134357 100644 --- a/modules/01_prepare_bam.nf +++ b/modules/01_prepare_bam.nf @@ -17,7 +17,7 @@ process PREPARE_BAM { memory "${params.prepare_bam_memory}" tag "${name}" - conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0" : null) + conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null) input: tuple val(name), val(type), file(bam) @@ -58,7 +58,7 @@ process INDEX_BAM { memory "${params.index_memory}" tag "${name}" - conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0" : null) + conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null) input: tuple val(name), val(type), file(bam) diff --git a/modules/02_mark_duplicates.nf b/modules/02_mark_duplicates.nf index 852f3ab..9dc42e1 100644 --- a/modules/02_mark_duplicates.nf +++ b/modules/02_mark_duplicates.nf @@ -11,7 +11,7 @@ process MARK_DUPLICATES { tag "${name}" publishDir "${params.output}/${name}/metrics/mark_duplicates", mode: "copy", pattern: "*.dedup_metrics.txt" - conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0" : null) + conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null) input: tuple val(name), val(type), file(bam) diff --git a/modules/03_metrics.nf b/modules/03_metrics.nf index af1915b..43909fa 100644 --- a/modules/03_metrics.nf +++ b/modules/03_metrics.nf @@ -13,7 +13,7 @@ process HS_METRICS { tag "${name}" publishDir "${params.output}/${name}/metrics/hs_metrics", mode: "copy" - conda (params.enable_conda ? 
"bioconda::gatk4=4.2.0.0" : null) + conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null) input: tuple val(name), val(type), file(bam), file(bai) @@ -53,7 +53,7 @@ process METRICS { publishDir "${params.output}/${name}/metrics/gatk_multiple_metrics", mode: "copy" // NOTE: the method CollectMultipleMetrics has a hidden dependency to R for making plots - conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0 r::r=3.6.0" : null) + conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0 r::r=3.6.0" : null) input: tuple val(name), val(type), file(bam), file(bai) diff --git a/modules/04_realignment_around_indels.nf b/modules/04_realignment_around_indels.nf index 712db9a..2735e86 100644 --- a/modules/04_realignment_around_indels.nf +++ b/modules/04_realignment_around_indels.nf @@ -14,7 +14,7 @@ process REALIGNMENT_AROUND_INDELS { // NOTE: this dependency is fixed to GATK 3 as the realignment around indels is not anymore maintained in GATK 4 // but still for some reason for GATK 3 to work the dependency to GATK 4 is needed - conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0 bioconda::gatk=3.8" : null) + conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0 bioconda::gatk=3.8" : null) input: tuple val(name), val(type), file(bam), file(bai) diff --git a/modules/05_bqsr.nf b/modules/05_bqsr.nf index 391277c..c893929 100644 --- a/modules/05_bqsr.nf +++ b/modules/05_bqsr.nf @@ -11,7 +11,7 @@ process BQSR { publishDir "${params.output}/${name}", mode: "copy" tag "${name}" - conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0" : null) + conda (params.enable_conda ? 
"bioconda::gatk4=4.2.5.0" : null) input: tuple val(name), val(type), file(bam), file(bai) diff --git a/nextflow.config b/nextflow.config index 16ebdd1..8cf89d7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -46,7 +46,7 @@ process.shell = ['/bin/bash', '-euo', 'pipefail'] cleanup = true -VERSION = '1.7.3' +VERSION = '1.8.0' DOI = 'https://zenodo.org/badge/latestdoi/358400957' manifest { From cf7d7186476ce4ceefb3cfc03be25aa42136f311 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 28 Feb 2022 11:05:58 +0100 Subject: [PATCH 2/4] change MarkDuplicatesSpark to MarkDuplicates --- modules/02_mark_duplicates.nf | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/modules/02_mark_duplicates.nf b/modules/02_mark_duplicates.nf index 9dc42e1..9ff86ea 100644 --- a/modules/02_mark_duplicates.nf +++ b/modules/02_mark_duplicates.nf @@ -1,5 +1,5 @@ -params.mark_duplicates_cpus = 16 -params.mark_duplicates_memory = "64g" +params.mark_duplicates_cpus = 2 +params.mark_duplicates_memory = "16g" params.remove_duplicates = true params.skip_metrics = false params.output = 'output' @@ -21,15 +21,25 @@ process MARK_DUPLICATES { file("${name}.dedup_metrics.txt") optional true script: - dedup_metrics = params.skip_metrics ? "": "--metrics-file ${name}.dedup_metrics.txt" - remove_duplicates = params.remove_duplicates ? "--remove-all-duplicates true" : "--remove-all-duplicates false" + dedup_metrics = params.skip_metrics ? "": "--METRICS_FILE ${name}.dedup_metrics.txt" + remove_duplicates = params.remove_duplicates ? 
"--REMOVE_DUPLICATES true" : "--REMOVE_DUPLICATES false" """ mkdir tmp - gatk MarkDuplicatesSpark \ + gatk SortSam \ + --INPUT ${bam} \ + --OUTPUT ${name}.sorted.bam \ + --SORT_ORDER coordinate + + gatk MarkDuplicates \ --java-options '-Xmx${params.mark_duplicates_memory} -Djava.io.tmpdir=tmp' \ - --input ${bam} \ - --output ${name}.dedup.bam \ - --conf 'spark.executor.cores=${task.cpus}' ${remove_duplicates} ${dedup_metrics} + --INPUT ${name}.sorted.bam \ + --OUTPUT ${name}.dedup.bam \ + --ASSUME_SORT_ORDER coordinate \ + --CREATE_INDEX true ${remove_duplicates} ${dedup_metrics} + + cp ${name}.dedup.bai ${name}.dedup.bam.bai + + rm -f ${name}.sorted.bam """ } From 2a0f1e66f2bf1612ec96df3909c4143a430f22db Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 28 Feb 2022 11:07:20 +0100 Subject: [PATCH 3/4] update defaults for memory --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index fab5149..8e517de 100755 --- a/main.nf +++ b/main.nf @@ -30,8 +30,8 @@ params.collect_hs_metrics_min_mapping_quality = false // computational resources params.prepare_bam_cpus = 3 params.prepare_bam_memory = "8g" -params.mark_duplicates_cpus = 16 -params.mark_duplicates_memory = "64g" +params.mark_duplicates_cpus = 2 +params.mark_duplicates_memory = "16g" params.realignment_around_indels_cpus = 2 params.realignment_around_indels_memory = "31g" params.bqsr_cpus = 3 From a2384876f03a95f0feea7edbff46430ecc475269 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Mon, 28 Feb 2022 11:19:39 +0100 Subject: [PATCH 4/4] try downgrading GATK for realignment around indels --- modules/04_realignment_around_indels.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/04_realignment_around_indels.nf b/modules/04_realignment_around_indels.nf index 2735e86..1f86288 100644 --- a/modules/04_realignment_around_indels.nf +++ b/modules/04_realignment_around_indels.nf @@ -13,8 +13,8 @@ process 
REALIGNMENT_AROUND_INDELS { publishDir "${params.output}/${name}/metrics/realignment", mode: "copy", pattern: "*.RA.intervals" // NOTE: this dependency is fixed to GATK 3 as the realignment around indels is not anymore maintained in GATK 4 - // but still for some reason for GATK 3 to work the dependency to GATK 4 is needed - conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0 bioconda::gatk=3.8" : null) + // but, for some reason, GATK 3 still needs the GATK 4.2.0.0 dependency in order to work + conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0 bioconda::gatk=3.8" : null) input: tuple val(name), val(type), file(bam), file(bai)